├── .github └── workflows │ ├── conda-build.yml │ ├── python-app.yml │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── ibaqpy-batch-correction-example.html ├── ibaqpy-batch-correction-example.ipynb └── images │ ├── 9_tissues-boxplot.png │ ├── 9_tissues-density.png │ ├── PXD007683-11samples-density.png │ ├── PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png │ ├── PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png │ ├── PXD007683-LFQ-11samples-no_cov.png │ ├── PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png │ ├── PXD007683-LFQ-ibaq-vs-maxquant-density.png │ ├── PXD007683-LFQ-no_cov.png │ ├── PXD007683-TMTvsLFQ-boxplot.png │ ├── PXD007683-TMTvsLFQ-density.png │ ├── PXD019909-11samples-density.png │ ├── PXD019909-TMTvsLFQ-density.png │ ├── fold_change_lfq.png │ ├── fold_change_tmt.png │ ├── method_mean_cv_016999_lfq.png │ ├── method_mean_cv_lfq.png │ ├── method_mean_cv_tmt.png │ ├── method_per_p_cv_016999_lfq.png │ ├── method_per_p_cv_lfq.png │ ├── method_per_p_cv_tmt.png │ ├── missing_peptides_by_sample.png │ ├── missing_value_016999_lfq.png │ └── per_protein_cv.png ├── data ├── __init__.py ├── contaminants_ids.tsv ├── high_abundant_proteins.tsv ├── histones.json ├── ibaqpy.drawio └── ibaqpy.drawio.png ├── environment.yaml ├── ibaqpy ├── __init__.py ├── commands │ ├── __init__.py │ ├── correct_batches.py │ ├── features2peptides.py │ ├── peptides2protein.py │ └── tsne_visualization.py ├── data │ ├── __init__.py │ ├── data.py │ └── organisms.json ├── ibaq │ ├── __init__.py │ ├── combiner.py │ ├── file_utils.py │ ├── ibaqpy_commons.py │ ├── ibaqpy_postprocessing.py │ ├── imputation_methods.py │ ├── logger.py │ ├── logging_config.py │ ├── peptide_normalization.py │ ├── peptides2protein.py │ ├── utils.py │ └── write_queue.py ├── ibaqpyc.py └── model │ ├── __init__.py │ ├── normalization.py │ ├── organism_metadata.py │ └── quantification_type.py ├── pyproject.toml ├── qodana.yaml ├── recipe ├── conda_build_config.yaml └── meta.yaml ├── requirements.txt └── tests ├── __init__.py ├── example ├── Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta ├── PXD017834-TMT.sdrf.tsv ├── PXD017834-example-ibaq.tsv ├── PXD017834-peptides.csv ├── feature.parquet └── out │ └── .gitignore ├── ibaq-raw-hela ├── PXD000396.ibaq.tsv ├── PXD005481.ibaq.tsv └── PXD039414.ibaq.tsv ├── test_batch_correction.py ├── test_file_utils.py ├── test_ibaqpy.py ├── test_ibaqpy_postprocessing.py └── test_peptide_normalize.py /.github/workflows/conda-build.yml: -------------------------------------------------------------------------------- 1 | name: Conda Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | defaults: 9 | run: 10 | shell: bash -el {0} 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v4 # Update to latest version 14 | 15 | - name: Set up Miniconda 16 | uses: conda-incubator/setup-miniconda@v3 17 | with: 18 | mamba-version: "*" 19 | channels: conda-forge,bioconda 20 | cache-downloads: true 21 | auto-update-conda: false 22 | activate-environment: test 23 | python-version: "3.12" 24 | 25 | - name: Setup conda-build and anaconda-client 26 | run: | 27 | mamba install -q conda-build anaconda-client conda-verify 28 | 29 | - name: Build package 30 | run: | 31 | conda build purge-all 32 | conda config --set solver libmamba 33 | conda config --set channel_priority strict 34 | conda build recipe --suppress-variables --override-channels --channel 
conda-forge --channel bioconda --no-anaconda-upload --output-folder ./ 35 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: ["main", "dev"] 9 | pull_request: 10 | branches: ["main", "dev"] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: ["3.9", "3.10", "3.11"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install flake8 pytest 33 | pip install poetry 34 | poetry build 35 | pip install dist/*.whl 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | poetry run pytest 45 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: ["main", "dev"] 9 | pull_request: 10 | branches: ["main", "dev"] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | pip install poetry 32 | poetry build 33 | pip install dist/*.whl 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Test with pytest 41 | run: | 42 | pytest 43 | - name: Test commandline tool 44 | run: | 45 | ibaqpyc --help 46 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v3 26 | with: 27 | python-version: "3.x" 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install build 32 | - name: Build package 33 | run: python -m build 34 | - name: Publish package 35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | verbose: true 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | res.csv 3 | /data/PXD004682-Peptide-Intensities.tsv 4 | /data/PXD008934.sdrf.tsv.gz 5 | /data/PXD008934-out.mzTab.gz 6 | /data/PXD008934-out_msstats.csv.gz 7 | /data/PXD008934-out_triqler.tsv.gz 8 | venv 9 | /venv/ 10 | /compute-all.sh 11 | /ibaqpy.egg-info/ 12 | /ibaqpy_temp/ 13 | /tests/PXD003947/IBAQ-QCprofile.pdf 14 | /tests/PXD003947/PXD003947-ibaq-norm.csv 15 | /tests/PXD003947/PXD003947-peptides-norm.csv 16 | /tests/PXD003947/PXD003947-peptides-norm.parquet 17 | /build/ 18 | /dist/ 19 | /**/__pycache__/ 20 | .qodo 21 | /.vscode/ 22 | /tests/example/ibaq_corrected_combined.h5ad 23 | /tests/example/ibaq_corrected_combined.tsv 24 | /tests/example/PXD017834-ibaq.tsv 25 | /tests/example/PXD017834-peptides-norm.csv 26 | /tests/example/PXD017834-peptides-norm.parquet 27 | /tests/example/QCprofile.pdf 28 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use ibaqpy in your research, please cite this work." 
3 | title: "ibaqpy: A scalable Python package for baseline quantification in proteomics leveraging SDRF metadata" 4 | authors: 5 | - family-names: "Zheng" 6 | given-names: "Ping" 7 | - family-names: "Audain" 8 | given-names: "Enrique" 9 | - family-names: "Webel" 10 | given-names: "Henry" 11 | - family-names: "Dai" 12 | given-names: "Chengxin" 13 | - family-names: "Klein" 14 | given-names: "Joshua" 15 | - family-names: "Hitz" 16 | given-names: "Marc-Phillip" 17 | - family-names: "Sachsenberg" 18 | given-names: "Timo" 19 | - family-names: "Bai" 20 | given-names: "Mingze" 21 | - family-names: "Perez-Riverol" 22 | given-names: "Yasset" 23 | abstract: "Intensity-based absolute quantification (iBAQ) is essential in proteomics as it allows for the assessment of a protein's absolute abundance in various samples or conditions. However, the computation of these values for increasingly large-scale and high-throughput experiments, such as those using DIA, TMT, or LFQ workflows, poses significant challenges in scalability and reproducibility. Here, we present ibaqpy, a Python package designed to compute iBAQ values efficiently for experiments of any scale." 24 | date-released: "2025-02-08" 25 | doi: "10.1101/2025.02.08.637208" 26 | url: "https://www.biorxiv.org/content/early/2025/02/08/2025.02.08.637208" 27 | journal: "bioRxiv" 28 | publisher: "Cold Spring Harbor Laboratory" 29 | version: "2025.02.08.637208" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 BigBio Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include ibaqpy/data/ * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ibaqpy 2 | 3 | [![Python application](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml) 4 | [![Upload Python Package](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml) 5 | [![Codacy Badge](https://app.codacy.com/project/badge/Grade/6a1961c7d57c4225b4891f73d58cac6b)](https://app.codacy.com/gh/bigbio/ibaqpy/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) 6 | [![PyPI version](https://badge.fury.io/py/ibaqpy.svg)](https://badge.fury.io/py/ibaqpy) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/ibaqpy) 8 | 9 | iBAQ (Intensity-Based Absolute Quantification) determines the abundance of a protein by dividing the total precursor intensities by the number of theoretically observable peptides of the protein ([manuscript here](https://pubmed.ncbi.nlm.nih.gov/16219938/)). ibaqpy is a Python package that computes iBAQ values starting from a feature parquet file from [quantmsio](https://github.com/bigbio/quantms.io) and an [SDRF](https://github.com/bigbio/proteomics-sample-metadata) file. In addition, the package computes other iBAQ-derived values, including rIBAQ, log2, and ppb. 10 | 11 | ibaqpy also allows computing the TPA value (Total Protein Approach), protein copy number, and protein concentration. TPA is computed by summing the peptide intensities of each protein and dividing by its molecular mass, which yields the relative concentration of each protein. By using [ProteomicRuler](https://www.sciencedirect.com/science/article/pii/S1535947620337749), it is possible to calculate the protein copy number and absolute concentration. OpenMS is used to calculate the theoretical molecular mass of each protein. As with the iBAQ calculation, the TPA value of a protein group is the sum of its intensities divided by the sum of the theoretical molecular masses. 12 | 13 | The protein copy number calculation follows this formula: 14 | 15 | ``` 16 | protein copies per cell = protein MS-signal * (avogadro / molecular mass) * (DNA mass / histone MS-signal) 17 | ``` 18 | 19 | For the cellular protein copy number calculation, the UniProt accessions of the histones of the given species are obtained first, and the molecular mass of its DNA is calculated. The dataframe is then grouped by condition, and the copy number, molar amount, and mass of the proteins are calculated. For protein concentration, the cell volume is first derived from the cellular protein concentration, and the protein mass is then divided by this volume to obtain the intracellular protein concentration.
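
To make the proteomic-ruler step concrete, the formula above can be sketched in a few lines of pandas. This is a minimal illustration, not ibaqpy's exact implementation: the column names, the histone accession set (compare `data/histones.json`), and the DNA mass value are assumptions for the example.

```python
import pandas as pd

AVOGADRO = 6.02214076e23  # molecules per mole
DNA_MASS_GRAMS = 6.5e-12  # assumed DNA mass of a diploid human cell (~6.5 pg)

def proteomic_ruler_copies(df: pd.DataFrame, histone_accessions: set) -> pd.Series:
    """protein copies per cell = MS-signal * (avogadro / molecular mass) * (DNA mass / histone MS-signal)."""
    # The summed histone MS-signal anchors the ruler, since histone mass tracks DNA mass.
    histone_signal = df.loc[df["ProteinName"].isin(histone_accessions), "NormIntensity"].sum()
    return (
        df["NormIntensity"]
        * (AVOGADRO / df["MolecularWeight"])
        * (DNA_MASS_GRAMS / histone_signal)
    )
```
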
20 | 21 | ### Overview of ibaq-base values computation 22 | 23 | As mentioned before, ibaq values are calculated by dividing the total precursor intensities by the number of theoretically observable peptides of the protein. We use the following steps to calculate the iBAQ values: 24 | 25 | - _Observable peptides_, the protein sequence is digested in silico using a specific enzyme. The current version of this tool uses OpenMS to load the FASTA file and [ProteaseDigestion](https://openms.de/current_doxygen/html/classOpenMS_1_1ProteaseDigestion.html) to digest the protein sequences, finally yielding the theoretical peptide number of each protein (see the sketch below). 26 | 27 | - _Total precursor intensities_, the total intensity of a protein is calculated by summing the intensity of all peptides that belong to the protein. The intensity values are obtained from the feature parquet file in [quantms.io](https://github.com/bigbio/quantms.io). 28 | 29 | > Note: If a protein-group exists in the peptide intensity dataframe, the intensity of all proteins in the protein-group is summed based on the above steps and then divided by the number of proteins in the protein-group.
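
As an illustration of the digestion step, here is a minimal pyopenms sketch that counts the theoretically observable peptides of each protein in a FASTA file. The enzyme name and length bounds are example values, not necessarily ibaqpy's defaults:

```python
from pyopenms import AASequence, FASTAFile, ProteaseDigestion

def count_observable_peptides(fasta_path: str, enzyme: str = "Trypsin",
                              min_aa: int = 7, max_aa: int = 30) -> dict:
    """Map protein identifier -> number of theoretical peptides after in-silico digestion."""
    entries = []
    FASTAFile().load(fasta_path, entries)  # fills `entries` with FASTAEntry objects
    digestion = ProteaseDigestion()
    digestion.setEnzyme(enzyme)
    counts = {}
    for entry in entries:
        peptides = []
        digestion.digest(AASequence.fromString(entry.sequence), peptides)
        # Keep only peptides within the configured length range.
        counts[entry.identifier] = sum(1 for p in peptides if min_aa <= p.size() <= max_aa)
    return counts
```
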
30 | 31 | ### Other values 32 | 33 | - `Ibaq` - the iBAQ value is calculated as `Total precursor intensities / Number of observable peptides` 34 | 35 | - `IbaqNorm` - the ibaq value normalized by the total ibaq of the sample, `ibaq / sum(ibaq)`; the sum is taken over the proteins in the same _sample + condition_. 36 | 37 | - `IbaqLog` - the ibaq log is calculated as `10 + log10(IbaqNorm)`. This normalized ibaq value was developed [by the ProteomicsDB Team](https://academic.oup.com/nar/article/46/D1/D1271/4584631). 38 | 39 | - `IbaqPpb` - the resulting IbaqNorm multiplied by 100M, `IbaqNorm * 100'000'000`. This method was originally developed [by the PRIDE Team](https://www.nature.com/articles/s41597-021-00890-2). 40 | 41 | - `IbaqBec` - the ibaq value after batch effect correction, using the combat-norm algorithm from the inmoose package. 42 | 43 | - `TPA` - the TPA value is calculated as `NormIntensity / MolecularWeight` 44 | 45 | - `CopyNumber` - the protein copy number is calculated using a proteomic ruler approach. 46 | 47 | - `Concentration[nM]` - the protein concentration is calculated using the total weight and a provided concentration per cell (cpc). 48 |
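
For illustration, the three normalization transforms above can be written as a short pandas sketch. It assumes a long-format dataframe with `SampleID`, `Condition`, and `Ibaq` columns; the column names are illustrative, not ibaqpy's internal code:

```python
import numpy as np
import pandas as pd

def add_ibaq_transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Add IbaqNorm, IbaqLog, and IbaqPpb columns following the definitions above."""
    # IbaqNorm: each iBAQ divided by the total iBAQ of its sample + condition group.
    df["IbaqNorm"] = df["Ibaq"] / df.groupby(["SampleID", "Condition"])["Ibaq"].transform("sum")
    # IbaqLog: the ProteomicsDB-style shifted log, 10 + log10(IbaqNorm).
    df["IbaqLog"] = 10 + np.log10(df["IbaqNorm"])
    # IbaqPpb: IbaqNorm scaled by 100 million, as used by PRIDE.
    df["IbaqPpb"] = df["IbaqNorm"] * 100_000_000
    return df
```
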
49 | ### From quantms to Ibaq values 50 | 51 | ![Ibaq](./data/ibaqpy.drawio.png "IBAQ") 52 | 53 | The output of quantms is converted into a quantms.io feature file. quantms.io provides a unified format for processing report files, including peptide intensity information. In quantms.io, you can use the `convert-ibaq` command, providing a **feature file** and an **SDRF file**, to inject the experimental information into the feature file, generating input ready for ibaqpy. 54 | 55 | ```asciidoc 56 | >$ quantmsioc convert-feature --sdrf_file PXD004452-Hella-trypsin.sdrf.tsv --msstats_file PXD004452-Hella-trypsin.sdrf_openms_design_msstats_in.csv --mztab_file PXD004452-Hella-trypsin.sdrf_openms_design_openms.mzTab --file_num 30 --output_folder res --duckdb_max_memory 64GB --output_prefix_file PXD004452 57 | >$ quantmsioc convert-ibaq --feature_file res/PXD004452-6c224f5a-7c1f-46f9-9dae-1541baeef8fe.feature.parquet --sdrf_file PXD004452-Hella-trypsin.sdrf.tsv --output_folder ibaq --output_prefix_file PXD004452 58 | ``` 59 | 60 | A feature in quantms.io is the combination of the following columns: 61 | 62 | - `ProteinName`: Protein name 63 | - `Peptidoform`: Peptide sequence including post-translational modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)` 64 | - `PEPTIDE_CANONICAL`: Canonical peptide sequence 65 | - `PrecursorCharge`: Precursor charge 66 | - `Channel`: Label channel 67 | - `Condition`: Condition label `(e.g. heart)` 68 | - `BioReplicate`: Biological replicate index `(e.g. 1)` 69 | - `Run`: Run index `(e.g. 1)` 70 | - `Fraction`: Fraction index `(e.g. 1)` 71 | - `Intensity`: Peptide intensity 72 | - `Reference`: Reference file 73 | - `SampleID`: Sample ID `(e.g. PXD003947-Sample-3)` 74 | 75 | In summary, each feature is the unique combination of a peptide sequence including modifications (peptidoform), precursor charge state, condition, biological replicate, run, fraction, reference_file_name, sample_accession, and a given intensity. In order to go from these features to protein ibaq values, the package does the following: 76 | 77 | #### Data preprocessing 78 | 79 | In this step, `features2peptides`, ibaqpy will: 80 | 81 | - Parse the protein identifiers and retain only unique peptides. 82 | - Remove lines where the intensity or the study condition is empty. This can happen in the following cases: 83 | - The DIA pipeline sometimes releases intensities with value 0 for some features. 84 | - quantms.io does not contain feature information for some conditions. This extreme case can happen when no ID/Quant was found for a given condition during the analysis. 85 | - Filter peptides with fewer amino acids than `min_aa`. 86 | - Remove low-confidence proteins according to a threshold on the number of unique peptides: we use a threshold of 2 unique peptides to consider a protein high-confidence. This default value of 2 is applied if not specified by the user; the threshold can be changed with the `--min_unique` parameter. 87 | - Filter decoy, contaminant, and entrapment proteins: proteins with the prefixes `DECOY`, `CONTAMINANT`, or `ENTRAPMENT` can be removed; by default, this filter is not applied. If users want to remove these proteins, they can use the `--remove_decoy_contaminants` parameter. 88 | - Filter user-specified proteins: the user can provide a list of protein identifiers to remove from the analysis using the `--remove_ids` parameter. This removes proteins that could bias the intensity normalization. For example, ALBU_HUMAN can be overexpressed in human tissues, which is why we may want to remove it when analyzing tissue data. 89 | - Normalize at the feature level between MS runs (technical repetitions): 90 | - When `MS runs > 1` in a sample, the `mean` of the per-run averages (`mean`, `median`, or `iqr`) is calculated (SampleMean). 91 | - The ratio between the SampleMean and each MS run's average is used as a reference to scale the original intensities. 92 | - Merge peptidoforms across fractions and technical repetitions: combine technical replicates and fractions from the same sample. 93 | - Normalize the data at the sample level: 94 | - `globalMedian`: adjusts all samples to a global median. 95 | - `conditionMedian`: all samples under the same condition are adjusted to the median value of that condition. 96 | - Remove low-frequency peptides if `sample number > 1`: this filter is enabled with the `--remove_low_frequency_peptides` parameter and, by default, removes peptides present in less than 20% of the samples. 97 | - Assemble peptidoforms into peptides: 98 | A peptidoform is a combination of `PeptideSequence(Modifications) + Charge + BioReplicate + Fraction` (among other features), and a peptide is a combination of `PeptideSequence(Canonical) + BioReplicate`. ibaqpy will: 99 | - Select the peptidoform with the highest intensity across different modifications, fractions, and technical replicates. 100 | - Merge peptidoforms across different charges and combine them into peptides. To merge peptidoforms, the package applies the `sum` of their intensity values. 101 | - Intensity transformation to log: the user can specify the `--log2` parameter to transform the peptide intensity values to log2 before normalization. 102 | 103 | > Note: At the moment, ibaqpy computes the ibaq values based only on unique peptides. Shared peptides are discarded. However, if a group of proteins shares the same unique peptides (e.g., Pep1 -> Prot1;Prot2 and Pep2 -> Prot1;Prot2), the intensity of the proteins is summed and divided by the number of proteins in the group. 104 | 105 | #### Calculate the IBAQ Value 106 | 107 | First, the peptide intensity dataframe is grouped by protein name, sample name, and condition, and the protein intensity of each group is summed. Depending on the experiment type, the same protein may have missing peptides in different samples, so the number of peptides detected for a protein varies across samples. To handle this difference, the summed intensity is normalized within each group using the formula `sum(peptides) / n`, where n is the number of detected peptides. Finally, the normalized intensity of the protein is divided by the number of theoretical peptides (see the sketch below). See details in `peptides2protein`. 108 | 109 | > Note: In all scripts and result files, _uniprot accession_ is used as the protein identifier.
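
Schematically, the computation just described reduces to a groupby and two divisions. Below is a minimal pandas sketch that mirrors the text above; the column names and the `theoretical_peptides` mapping (from the in-silico digestion) are illustrative assumptions, not ibaqpy's exact internals:

```python
import pandas as pd

def compute_ibaq(peptides: pd.DataFrame, theoretical_peptides: dict) -> pd.DataFrame:
    """iBAQ per protein/sample/condition: (sum(peptides) / n) / theoretical peptide count."""
    grouped = (
        peptides.groupby(["ProteinName", "SampleID", "Condition"])["NormIntensity"]
        .agg(total="sum", n="size")
        .reset_index()
    )
    # Normalize the summed intensity by the number of detected peptides (sum(peptides) / n),
    # then divide by the number of theoretically observable peptides of the protein.
    grouped["Ibaq"] = (grouped["total"] / grouped["n"]) / grouped["ProteinName"].map(
        theoretical_peptides
    )
    return grouped[["ProteinName", "SampleID", "Condition", "Ibaq"]]
```
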
110 | 111 | ### How to install ibaqpy 112 | 113 | ibaqpy is available on PyPI and can be installed using pip: 114 | 115 | ```asciidoc 116 | pip install ibaqpy 117 | ``` 118 | 119 | You can also install the package from source: 120 | 121 | 1. Clone the repository: 122 | 123 | ```asciidoc 124 | >$ git clone https://github.com/bigbio/ibaqpy 125 | >$ cd ibaqpy 126 | ``` 127 | 128 | 2. Create the conda environment: 129 | 130 | ```asciidoc 131 | >$ mamba env create -f environment.yaml 132 | ``` 133 | 134 | 3. Install ibaqpy: 135 | 136 | ```asciidoc 137 | >$ pip install . 138 | ``` 139 | 140 | ### Collecting intensity files from quantms.org 141 | 142 | Absolute quantification files are stored at the following URL: 143 | 144 | ``` 145 | https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/quantms-data/ 146 | ``` 147 | 148 | Inside each project reanalysis folder, the proteomicslfq folder contains the feature file, named with the structure `{Name of the project}.{Random uuid}.feature.parquet`. 149 | 150 | E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/quantms-data/MSV000079033.1/MSV000079033.1-bd44c7e3-654c-444d-9e21-0f701d6dac94.feature.parquet 151 | 152 | ### Major commands 153 | 154 | #### Features to peptides 155 | 156 | ```asciidoc 157 | ibaqpy features2peptides -p tests/PXD003947/PXD003947-feature.parquet -s tests/PXD003947/PXD003947.sdrf.tsv --remove_ids data/contaminants_ids.tsv --remove_decoy_contaminants --remove_low_frequency_peptides --output tests/PXD003947/PXD003947-peptides-norm.csv 158 | ``` 159 | 160 | ```asciidoc 161 | Usage: features2peptides.py [OPTIONS] 162 | 163 | Options: 164 | -p, --parquet TEXT Parquet feature file generated by quantms.io 165 | -s, --sdrf TEXT SDRF file generated by quantms 166 | --min_aa INTEGER Minimum number of amino acids to filter 167 | peptides 168 | --min_unique INTEGER Minimum number of unique peptides to filter 169 | proteins 170 | --remove_ids TEXT Remove specific protein ids from the 171 | analysis using a file with one id per line 172 | --remove_decoy_contaminants Remove decoy and contaminant proteins from 173 | the analysis 174 | --remove_low_frequency_peptides 175 | Remove peptides that are present in less 176 | than 20% of the samples 177 | --output TEXT Peptide intensity file including all other 178 | properties for normalization 179 | --skip_normalization Skip normalization step 180 | --nmethod TEXT Normalization method used to normalize 181 | feature intensities across technical repetitions 182 | (options: mean, median, iqr, none) 183 | --pnmethod TEXT Normalization method used to normalize 184 | peptide intensities across all samples 185 | (options: globalMedian, conditionMedian, none) 186 | --log2 Transform the peptide intensity values to log2 187 | before normalization 188 | --save_parquet Save normalized peptides to parquet 189 | --help Show this message and exit. 190 | ``` 191 | 192 | #### Compute IBAQ/TPA 193 | 194 | ```asciidoc 195 | ibaqpy peptides2protein -f Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta -p PXD017834-peptides.csv -e Trypsin -n -t -r --ploidy 2 --cpc 200 --organism human --output PXD003947.tsv --verbose 196 | ``` 197 | 198 | ```asciidoc 199 | Usage: peptides2protein [OPTIONS] 200 | 201 | Options: 202 | -f, --fasta TEXT Protein database used to compute IBAQ values 203 | -p, --peptides TEXT Peptide identifications with intensities following the 204 | peptide intensity output 205 | -e, --enzyme TEXT Enzyme used during the analysis of the dataset 206 | (default: Trypsin) 207 | -n, --normalize Normalize IBAQ values by using the total IBAQ of 208 | the experiment 209 | --min_aa INTEGER Minimum number of amino acids to consider a peptide 210 | --max_aa INTEGER Maximum number of amino acids to consider a peptide 211 | -t, --tpa Whether to calculate TPA (is_flag=True) 212 | -r, --ruler Whether to use ProteomicRuler (is_flag=True) 213 | -i, --ploidy Ploidy number (default: 2) 214 | -m, --organism Organism source of the data (default: human) 215 | -c, --cpc Cellular protein concentration (g/L) (default: 200) 216 | -o, --output TEXT Output file with the proteins and ibaq values 217 | --verbose Print additional information about the distributions of 218 | the intensities, number of peptides removed after 219 | normalization, etc. 220 | --qc_report TEXT PDF file to store multiple QC images 221 | --help Show this message and exit. 222 | ``` 223 | 224 | ### Citation 225 | 226 | > Zheng P, Audain E, Webel H, Dai C, Klein J, Hitz MP, Sachsenberg T, Bai M, Perez-Riverol Y.
ibaqpy: A scalable Python package for baseline quantification in proteomics leveraging SDRF metadata. bioRxiv 2025.02.08.637208; doi: https://doi.org/10.1101/2025.02.08.637208 227 | 228 | Other relevant publications: 229 | 230 | > Wang H, Dai C, Pfeuffer J, Sachsenberg T, Sanchez A, Bai M, Perez-Riverol Y. Tissue-based absolute quantification using large-scale TMT and LFQ experiments. Proteomics. 2023 Oct;23(20):e2300188. doi: [10.1002/pmic.202300188](https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/10.1002/pmic.202300188). Epub 2023 Jul 24. PMID: 37488995. 231 | 232 | ### Credits 233 | 234 | - [Julianus Pfeuffer](@jpfeuffer) 235 | - [Yasset Perez-Riverol](@ypriverol) 236 | - [Hong Wang](@WangHong007) 237 | - [Ping Zheng](@zprobot) 238 | - [Joshua Klein](@mobiusklein) 239 | - [Enrique Audain](@enriquea) 240 | -------------------------------------------------------------------------------- /benchmarks/images/9_tissues-boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/9_tissues-boxplot.png -------------------------------------------------------------------------------- /benchmarks/images/9_tissues-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/9_tissues-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-11samples-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-11samples-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-no_cov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-no_cov.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-ibaq-vs-maxquant-density.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-ibaq-vs-maxquant-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-no_cov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-no_cov.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-TMTvsLFQ-boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-TMTvsLFQ-boxplot.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-TMTvsLFQ-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-TMTvsLFQ-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD019909-11samples-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD019909-11samples-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD019909-TMTvsLFQ-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD019909-TMTvsLFQ-density.png -------------------------------------------------------------------------------- /benchmarks/images/fold_change_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/fold_change_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/fold_change_tmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/fold_change_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_tmt.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_tmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/missing_peptides_by_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/missing_peptides_by_sample.png -------------------------------------------------------------------------------- /benchmarks/images/missing_value_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/missing_value_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/per_protein_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/per_protein_cv.png -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/data/__init__.py -------------------------------------------------------------------------------- /data/contaminants_ids.tsv: -------------------------------------------------------------------------------- 1 | P00761 2 | Q32MB2 3 | P19013 4 | Q7RTT2 5 | P15636 6 | P09870 7 | Q9R4J5 8 | P0C1U8 9 | P00766 10 | P13717 11 | Q9U6Y5 12 | P21578 13 | O76009 14 | O76011 15 | O76013 16 | O76014 17 | O76015 18 | P08779 19 | Q14525 20 | Q14532 21 | Q15323 22 | Q92764 23 | Q14533 24 | Q9NSB4 25 | P78385 26 | Q9NSB2 27 | P78386 28 | O43790 29 | Q6IFU5 30 | Q9UE12 31 | Q8IUT8 32 | Q6NT21 33 | Q6ISB0 34 | Q6NTB9 35 | Q6IFU6 36 | P04264 37 | P13647 38 | P35908 39 | P13645 40 | P35527 41 | A3EZ79 42 | P02533 43 | P02538 44 | P48668 45 | P04259 46 | A3EZ82 47 | Q2KIG3 48 | Q0VCM5 49 | Q3SZ57 50 | Q9N2I2 51 | Q3SZH5 52 | P28800 53 | Q1A7A4 54 | P41361 55 | Q2YDI2 56 | Q3Y5Z3 57 | P81644 58 | Q2KJ83 59 | Q2KIT0 60 | A2I7N3 61 | Q3SZV7 62 | Q2KJC7 63 | Q3SZR3 64 | Q28107 65 | P02672 66 | Q1RMN8 67 | Q58D62 68 | P06868 69 | Q2KJF1 70 | P02584 71 | P02777 72 | Q3SX14 73 | P17697 74 | Q6T181 75 | P34955 76 | 
P21752 77 | Q32PJ2 78 | Q28194 79 | P00978 80 | Q5XQN5 81 | Q32PI4 82 | Q9TTE1 83 | Q2KIU3 84 | P01044-1 85 | P67983 86 | Q28065 87 | Q862S4 88 | Q2KIF2 89 | Q3SX28 90 | Q0V8M9 91 | Q148H6 92 | Q29RQ1 93 | Q95M17 94 | P07224 95 | Q2HJF0 96 | Q2KIH2 97 | P13646-1 98 | Q04695 99 | A2I7N0 100 | P12763 101 | P17690 102 | P02769 103 | P02676 104 | P50448 105 | P01030 106 | P01966 107 | P02768-1 108 | P00735 109 | Q03247 110 | Q3ZBS7 111 | Q2UVX4 112 | Q9TT36 113 | Q28085 114 | Q3SX09 115 | P01045-1 116 | Q3ZBD7 117 | Q3MHN2 118 | Q9TRI1 119 | P15497 120 | Q95121 121 | Q05443 122 | P02070 123 | Q2KIS7 124 | Q3MHH8 125 | Q3T052 126 | Q3KUS7 127 | Q1RMK2 128 | Q2TBQ1 129 | Q05B55 130 | A2I7N1 131 | P04258 132 | Q2KJ62 133 | Q0IIK2 134 | Q3MHN5 135 | P02662 136 | P02663 137 | P02666 138 | P02668 139 | P31096 140 | P02754 141 | P00711 142 | P62894 143 | Q29443 144 | P19001 145 | A2AB72 146 | Q8VED5 147 | Q61726 148 | Q3ZAW8 149 | P50446 150 | Q497I4 151 | Q9D312 152 | P08730-1 153 | Q922U2 154 | Q8BGZ7 155 | A2A4G1 156 | Q9QWL7 157 | Q6IME9 158 | Q6NXH9 159 | A2VCT4 160 | P07744 161 | Q6IFZ6 162 | Q6IFX2 163 | Q9R0H5 164 | Q3TTY5 165 | Q0VBK2 166 | P02535-1 167 | Q61782 168 | A2A5Y0 169 | Q99PS0 170 | Q9D646 171 | P05784 172 | Q9DCV7 173 | Q9Z2K1 174 | P07477 175 | P05787 176 | Q6KB66-1 177 | Q7Z794 178 | Q9BYR9 179 | Q9BYQ5 180 | Q9BYR8 181 | Q9BYQ7 182 | Q3LI72 183 | Q9BYR4 184 | Q9BYQ8 185 | P60413 186 | P19012 187 | Q2M2I5 188 | O95678 189 | Q01546 190 | Q99456 191 | Q9H552 192 | P35900 193 | Q3SY84 194 | Q8N1A0 195 | Q8N1N4-2 196 | Q5XKE5 197 | P12035 198 | Q9C075 199 | P08729 200 | Q7Z3Y8 201 | Q7RTS7 202 | Q7Z3Y9 203 | Q7Z3Z0 204 | Q7Z3Y7 205 | P08727 206 | Q14CN4-1 207 | Q3KNV1 208 | Q86YZ3 209 | P20930 210 | Q5D862 211 | SPA34_BOVIN 212 | SPA35_BOVIN 213 | SPA37_BOVIN 214 | KRT86_HUMAN 215 | KT33A_HUMAN 216 | KRT34_HUMAN 217 | KRT36_HUMAN 218 | KRT36_HUMAN 219 | KRT37_HUMAN 220 | KRT38_HUMAN 221 | K2C75_HUMAN 222 | LALBA_BOVIN 223 | THRB_BOVIN 224 | TRYP_PIG 225 | CTRA_BOVIN 226 | AMBP_BOVIN 227 | CO4_BOVIN 228 | KNG1_BOVIN 229 | KNG1_BOVIN 230 | KNG1_BOVIN 231 | KNG2_BOVIN 232 | HBA_BOVIN 233 | HBB_BOVIN 234 | HBBF_BOVIN 235 | K1C14_HUMAN 236 | K1C10_MOUSE 237 | K2C6A_HUMAN 238 | PROF1_BOVIN 239 | CASA1_BOVIN 240 | CASA2_BOVIN 241 | CASB_BOVIN 242 | CASK_BOVIN 243 | FIBA_BOVIN 244 | FIBB_BOVIN 245 | LACB_BOVIN 246 | ALBU_HUMAN 247 | ALBU_BOVIN 248 | PLF4_BOVIN 249 | CO3A1_BOVIN 250 | K2C6B_HUMAN 251 | K2C1_HUMAN 252 | K1C18_MOUSE 253 | K2C8_HUMAN 254 | K2C8_HUMAN 255 | PLMN_BOVIN 256 | PROS_BOVIN 257 | TRY1_HUMAN 258 | K2C4_MOUSE 259 | K1C19_HUMAN 260 | K2C7_HUMAN 261 | K1C13_MOUSE 262 | K1C16_HUMAN 263 | CLOS_HATHI 264 | SSPA_STAAU 265 | K2C3_HUMAN 266 | FETUA_BOVIN 267 | K1C10_HUMAN 268 | K1C13_HUMAN 269 | K2C5_HUMAN 270 | NUCA_SERMA 271 | APOA1_BOVIN 272 | API_ACHLY 273 | APOH_BOVIN 274 | CLUS_BOVIN 275 | K1C19_MOUSE 276 | K1C15_HUMAN 277 | K1C15_HUMAN 278 | K2C4_HUMAN 279 | FILA_HUMAN 280 | LUXY_ALIFS 281 | TYB10_BOVIN 282 | A2AP_BOVIN 283 | OSTP_BOVIN 284 | A1AT_BOVIN 285 | K1C9_HUMAN 286 | K1C20_HUMAN 287 | K22E_HUMAN 288 | ANT3_BOVIN 289 | K2C6C_HUMAN 290 | K2C6A_MOUSE 291 | F12AI_BOVIN 292 | ITIH3_BOVIN 293 | KR10C_HUMAN 294 | CYC_BOVIN 295 | MT1A_BOVIN 296 | KRT83_HUMAN 297 | KRT85_HUMAN 298 | APOA2_BOVIN 299 | K22O_HUMAN 300 | APOE_BOVIN 301 | K1C17_HUMAN 302 | LUM_BOVIN 303 | K2C80_MOUSE 304 | ITIH1_BOVIN 305 | KT33B_HUMAN 306 | K1H2_HUMAN 307 | KRT81_HUMAN 308 | K1C28_BOVIN 309 | K2C72_HUMAN 310 | K1H1_HUMAN 311 | C4BPA_BOVIN 312 | CFAH_BOVIN 313 | FA5_BOVIN 314 | TRFE_BOVIN 
315 | CO7_BOVIN 316 | CBPB2_BOVIN 317 | TETN_BOVIN 318 | HP20_BOVIN 319 | HP252_BOVIN 320 | CBPN_BOVIN 321 | A1BG_BOVIN 322 | K1C24_HUMAN 323 | CO3_BOVIN 324 | ORC4_BOVIN 325 | APOA4_BOVIN 326 | KR195_HUMAN 327 | CO9_BOVIN 328 | VTDB_BOVIN 329 | GELS_BOVIN 330 | K2C71_HUMAN 331 | FETA_BOVIN 332 | A1AG_BOVIN 333 | HEMO_BOVIN 334 | ITIH4_BOVIN 335 | K22E_MOUSE 336 | ADIPO_BOVIN 337 | G6PI_BOVIN 338 | KRT35_MOUSE 339 | FETUB_BOVIN 340 | FILA2_HUMAN 341 | TPM2_BOVIN 342 | TPM2_BOVIN 343 | K2C79_HUMAN 344 | K2C5_BOVIN 345 | K1H1_MOUSE 346 | K1C40_HUMAN 347 | K1C39_HUMAN 348 | DMKN_HUMAN 349 | DMKN_HUMAN 350 | DMKN_HUMAN 351 | DMKN_HUMAN 352 | DMKN_HUMAN 353 | DMKN_HUMAN 354 | DMKN_HUMAN 355 | DMKN_HUMAN 356 | DMKN_HUMAN 357 | DMKN_HUMAN 358 | DMKN_HUMAN 359 | DMKN_HUMAN 360 | DMKN_HUMAN 361 | DMKN_HUMAN 362 | DMKN_HUMAN 363 | DMKN_HUMAN 364 | K1C42_MOUSE 365 | K1C39_MOUSE 366 | K2C1B_MOUSE 367 | K2C72_MOUSE 368 | K2C80_HUMAN 369 | K2C73_MOUSE 370 | K2C74_HUMAN 371 | K1C28_HUMAN 372 | K1C27_HUMAN 373 | K1C26_HUMAN 374 | K1C25_HUMAN 375 | K2C1B_HUMAN 376 | K2C73_HUMAN 377 | K2C73_HUMAN 378 | HORN_HUMAN 379 | K2C75_MOUSE 380 | KT222_HUMAN 381 | KT222_HUMAN 382 | K2C78_HUMAN 383 | K2C78_HUMAN 384 | K2C79_MOUSE 385 | K2C5_MOUSE 386 | KRT35_HUMAN 387 | PEDF_BOVIN 388 | CHIA_BOVIN 389 | K1C12_HUMAN 390 | K1C23_MOUSE 391 | KRA46_HUMAN 392 | KRA41_HUMAN 393 | KRA49_HUMAN 394 | KRA43_HUMAN 395 | KRA31_HUMAN 396 | KRA24_HUMAN 397 | K1C23_HUMAN 398 | K1C23_HUMAN 399 | K1C20_MOUSE 400 | KRT34_MOUSE 401 | K2C7_MOUSE 402 | IPSP_BOVIN 403 | KRT84_HUMAN 404 | KRT82_HUMAN 405 | K1C17_MOUSE 406 | K2C71_MOUSE 407 | ASPN_PSEFR 408 | THBG_BOVIN 409 | SPA31_BOVIN 410 | GFPL1_ZOASP 411 | K1C16_MOUSE 412 | trY2_BOVIN 413 | trY1_BOVIN 414 | Streptavidin 415 | REFSEQ:XP_986630 416 | REFSEQ:XP_001474382 417 | REFSEQ:XP_092267 418 | REFSEQ:XP_932229 419 | H-INV:HIT000016045 420 | H-INV:HIT000292931 421 | H-INV:HIT000015463 422 | ENSEMBL:ENSP00000377550 423 | ENSEMBL:ENSBTAP00000006074 424 | ENSEMBL:ENSBTAP00000038329 425 | REFSEQ:XP_001252647 426 | ENSEMBL:ENSBTAP00000007350 427 | ENSEMBL:ENSBTAP00000038253 428 | ENSEMBL:ENSBTAP00000023402 429 | ENSEMBL:ENSBTAP00000024466 430 | ENSEMBL:ENSBTAP00000023055 431 | ENSEMBL:ENSBTAP00000018229 432 | ENSEMBL:ENSBTAP00000016046 433 | ENSEMBL:ENSBTAP00000024462 434 | ENSEMBL:ENSBTAP00000014147 435 | ENSEMBL:ENSBTAP00000033053 436 | ENSEMBL:ENSBTAP00000001528 437 | ENSEMBL:ENSBTAP00000037665 438 | ENSEMBL:ENSBTAP00000031900 439 | ENSEMBL:ENSBTAP00000031360 440 | ENSEMBL:ENSBTAP00000018574 441 | ENSEMBL:ENSBTAP00000032840 442 | ENSEMBL:ENSBTAP00000011227 443 | ENSEMBL:ENSBTAP00000025008 444 | ENSEMBL:ENSBTAP00000034412 445 | ENSEMBL:ENSBTAP00000013050 446 | ENSEMBL:ENSBTAP00000016285 447 | ENSEMBL:ENSBTAP00000024146 448 | REFSEQ:XP_58501 449 | -------------------------------------------------------------------------------- /data/high_abundant_proteins.tsv: -------------------------------------------------------------------------------- 1 | P68871 2 | HBB_HUMAN 3 | 4 | -------------------------------------------------------------------------------- /data/histones.json: -------------------------------------------------------------------------------- 1 | { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | 
"Q5SQT3", 23 | "Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | "Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7" 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN" 196 | ] 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | 
"Q9CQ70", 219 | "Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | "A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7" 296 | ] 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040" 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | 
"B9EI85_MOUSE", 410 | "Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE" 418 | ] 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489" 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 | "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL" 478 | ] 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309" 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST" 502 | ] 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": [ 508 | "P48003", 509 | "P04909", 510 | "P04910", 511 | "P04913", 512 | "P09988", 513 | "P10651", 514 | "P09322" 515 | ], 516 | "histone_entries": [ 517 | "H2AZ_SCHPO", 518 | "H2A1_SCHPO", 519 | "H2A2_SCHPO", 520 | "H2B1_SCHPO", 521 | "H31_SCHPO", 522 | "H33_SCHPO", 523 | "H4_SCHPO" 524 | ] 525 | } 526 | } 527 | -------------------------------------------------------------------------------- /data/ibaqpy.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/data/ibaqpy.drawio.png -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | # You can use this file to create a conda environment for this pipeline: 2 | # conda env create -f environment.yml 3 | name: ibaqpy 4 | channels: 5 | - openms 6 | - conda-forge 7 | - bioconda 8 | dependencies: 9 | - python>=3.9 10 | - scikit-learn 11 | - pyopenms 12 | - numpy<2.1.0 13 | - click 14 | - pandas 15 | - matplotlib 16 | - pyarrow>=16.1.0 17 | - duckdb>=0.10.1 18 | - qnorm 19 | - scipy>=1.10 20 | - seaborn>=0.13.2 21 | - typing_extensions>=4.6.3 22 | - inmoose 23 | -------------------------------------------------------------------------------- /ibaqpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ibaqpy - A Python package for iBAQ (intensity-based absolute quantification) analysis. 3 | 4 | This package provides tools for processing and analyzing proteomics data using 5 | the iBAQ method, which allows for absolute quantification of proteins. 
6 | """ 7 | 8 | import warnings 9 | 10 | # Suppress numpy matrix deprecation warning 11 | warnings.filterwarnings( 12 | "ignore", category=PendingDeprecationWarning, module="numpy.matrixlib.defmatrix" 13 | ) 14 | 15 | __version__ = "0.0.5" 16 | 17 | # Import logging configuration 18 | from ibaqpy.ibaq.logging_config import initialize_logging 19 | 20 | # Initialize logging with default settings 21 | # Users can override these settings by calling initialize_logging with their own settings 22 | initialize_logging() 23 | -------------------------------------------------------------------------------- /ibaqpy/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/commands/__init__.py -------------------------------------------------------------------------------- /ibaqpy/commands/correct_batches.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import click 7 | import pandas as pd 8 | 9 | from ibaqpy.ibaq.file_utils import create_anndata, combine_ibaq_tsv_files 10 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID_REGEX, SAMPLE_ID, PROTEIN_NAME, IBAQ, IBAQ_BEC 11 | from ibaqpy.ibaq.ibaqpy_postprocessing import ( 12 | pivot_wider, 13 | pivot_longer, 14 | ) 15 | from ibaqpy.ibaq.utils import apply_batch_correction 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | 21 | 22 | def is_valid_sample_id( 23 | samples: Union[str, list, pd.Series], sample_id_pattern: str = SAMPLE_ID_REGEX 24 | ) -> bool: 25 | """ 26 | Validate sample IDs against a specified pattern. 27 | 28 | This function checks whether the provided sample IDs match a given regex pattern. 29 | It accepts a single sample ID, a list of sample IDs, or a pandas Series of sample IDs. 30 | If any sample ID does not match the pattern, it prints the invalid IDs and returns False. 31 | Otherwise, it returns True. 32 | 33 | Parameters 34 | ---------- 35 | samples : Union[str, list, pd.Series] 36 | The sample ID(s) to validate. 37 | sample_id_pattern : str, optional 38 | The regex pattern to validate the sample IDs against. Defaults to 'SAMPLE_ID_REGEX'. 39 | 40 | Returns 41 | ------- 42 | bool 43 | True if all sample IDs are valid, False otherwise. 44 | """ 45 | sample_pattern = re.compile(sample_id_pattern) 46 | 47 | # Ensure samples is a list for uniform processing 48 | if isinstance(samples, str): 49 | samples = [samples] 50 | elif isinstance(samples, pd.Series): 51 | samples = samples.tolist() 52 | 53 | # Identify invalid sample names. 54 | invalid_samples = [sample for sample in samples if not sample_pattern.fullmatch(sample)] 55 | 56 | if invalid_samples: 57 | logger.error("The following sample IDs are invalid:") 58 | for invalid_sample in invalid_samples: 59 | logger.error(f" - {invalid_sample}") 60 | return False 61 | return True 62 | 63 | 64 | def get_batch_id_from_sample_names(samples: list) -> list: 65 | """ 66 | Extract batch IDs from a list of sample names. 67 | 68 | Each sample name is expected to have a batch ID as a prefix, separated by a hyphen. 69 | The function validates that the batch ID consists of alphanumeric characters only. 70 | Returns a list of unique batch IDs as integer factors. 
71 | 72 | Parameters 73 | ---------- 74 | samples : list 75 | A list of sample names, each containing a batch ID prefix. 76 | 77 | Returns 78 | ------- 79 | list 80 | A list of integer factors representing unique batch IDs. 81 | 82 | Raises 83 | ------ 84 | ValueError 85 | If a sample name does not contain a valid batch ID prefix or if the 86 | batch ID contains non-alphanumeric characters. 87 | """ 88 | batch_ids = [] 89 | for sample in samples: 90 | parts = sample.split("-") 91 | if not parts or not parts[0]: 92 | raise ValueError(f"Invalid sample name format: {sample}. Expected batch-id prefix.") 93 | batch_id = parts[0] 94 | if not re.match(r"^[A-Za-z0-9]+$", batch_id): 95 | raise ValueError( 96 | f"Invalid batch ID format: {batch_id}. Expected alphanumeric characters only." 97 | ) 98 | batch_ids.append(batch_id) 99 | return pd.factorize(batch_ids)[0].tolist()  # list of integer codes, as documented 100 | 101 | 102 | def run_batch_correction( 103 | folder: str, 104 | pattern: str, 105 | comment: str, 106 | sep: str, 107 | output: str, 108 | sample_id_column: str = SAMPLE_ID, 109 | protein_id_column: str = PROTEIN_NAME, 110 | ibaq_raw_column: str = IBAQ, 111 | ibaq_corrected_column: str = IBAQ_BEC, 112 | export_anndata: bool = False, 113 | ) -> pd.DataFrame: 114 | """ 115 | Run batch correction on iBAQ data from TSV files in a specified directory. 116 | 117 | This function combines multiple TSV files, reshapes the data, validates sample IDs, 118 | applies batch correction, and optionally exports the results to an AnnData object. 119 | 120 | Parameters 121 | ---------- 122 | folder : str 123 | Directory containing the TSV files. 124 | pattern : str 125 | Pattern to match files in the directory. 126 | comment : str 127 | Character indicating the start of a comment line in the TSV files. 128 | sep : str 129 | Delimiter for reading the TSV files. 130 | output : str 131 | File path to save the corrected iBAQ values. 132 | sample_id_column : str, optional 133 | Column name for sample IDs. Defaults to 'SAMPLE_ID'. 134 | protein_id_column : str, optional 135 | Column name for protein IDs. Defaults to 'PROTEIN_NAME'. 136 | ibaq_raw_column : str, optional 137 | Column name for raw iBAQ values. Defaults to 'IBAQ'. 138 | ibaq_corrected_column : str, optional 139 | Column name for corrected iBAQ values. Defaults to 'IBAQ_BEC'. 140 | export_anndata : bool, optional 141 | Whether to export the data to an AnnData object. Defaults to False. 142 | 143 | Returns 144 | ------- 145 | pd.DataFrame 146 | DataFrame containing the original and corrected iBAQ values. 147 | 148 | Raises 149 | ------ 150 | ValueError 151 | If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved. 152 | FileNotFoundError 153 | If the output file does not exist when exporting to AnnData. 
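Examples
--------
A minimal sketch (folder and file names are illustrative; sample IDs in the
input files must match SAMPLE_ID_REGEX so their batch prefix can be parsed):

    df = run_batch_correction(
        folder="./ibaq-tsvs",
        pattern="*ibaq.tsv",
        comment="#",
        sep="\t",
        output="ibaq-corrected.tsv",
    )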
154 | """ 155 | 156 | # Load the data 157 | logger.info(f"Loading iBAQ data from TSV files in folder '{folder}'") 158 | 159 | try: 160 | df_ibaq = combine_ibaq_tsv_files(folder, pattern=pattern, comment=comment, sep=sep) 161 | except Exception as e: 162 | raise ValueError(f"Failed to load input files: {str(e)}") 163 | 164 | # Reshape the data to wide format 165 | df_wide = pivot_wider( 166 | df_ibaq, 167 | row_name=protein_id_column, 168 | col_name=sample_id_column, 169 | values=ibaq_raw_column, 170 | fillna=True, 171 | ) 172 | 173 | # Validate the sample IDs 174 | if not is_valid_sample_id(df_wide.columns, SAMPLE_ID_REGEX): 175 | raise ValueError("Invalid sample IDs found in the data.") 176 | 177 | # Get the batch IDs 178 | batch_ids = get_batch_id_from_sample_names(df_wide.columns) 179 | 180 | # Run batch correction 181 | logger.info("Applying batch correction to iBAQ values") 182 | df_corrected = apply_batch_correction(df_wide, list(batch_ids), kwargs={}) 183 | 184 | # Convert the data back to long format 185 | df_corrected = df_corrected.reset_index() 186 | df_corrected_long = pivot_longer( 187 | df_corrected, 188 | row_name=protein_id_column, 189 | col_name=sample_id_column, 190 | values=ibaq_corrected_column, 191 | ) 192 | 193 | # Add the corrected ibaq values to the original dataframe. 194 | # Use sample/protein ID keys to merge the dataframes. 195 | df_ibaq = df_ibaq.merge( 196 | df_corrected_long, how="left", on=[sample_id_column, protein_id_column] 197 | ) 198 | 199 | # Save the corrected iBAQ values to a file 200 | if output: 201 | try: 202 | df_ibaq.to_csv(output, sep=sep, index=False) 203 | except Exception as e: 204 | raise ValueError(f"Failed to save output file: {str(e)}") 205 | 206 | # Export the raw and corrected iBAQ values to an AnnData object 207 | if export_anndata: 208 | logger.info("Exporting raw and corrected iBAQ values to an AnnData object") 209 | output_path = Path(output) 210 | if not output_path.exists(): 211 | raise FileNotFoundError(f"Output file {output} does not exist!") 212 | adata = create_anndata( 213 | df_ibaq, 214 | obs_col=sample_id_column, 215 | var_col=protein_id_column, 216 | value_col=ibaq_raw_column, 217 | layer_cols=[ibaq_corrected_column], 218 | ) 219 | adata_filename = output_path.with_suffix(".h5ad") 220 | try: 221 | adata.write(adata_filename) 222 | except Exception as e: 223 | raise ValueError(f"Failed to write AnnData object: {e}") 224 | 225 | logger.info("Batch correction completed...") 226 | 227 | return df_ibaq 228 | 229 | 230 | @click.command("correct-batches", short_help="Batch effect correction for iBAQ values.") 231 | @click.option( 232 | "-f", 233 | "--folder", 234 | help="Folder that contains all TSV files with raw iBAQ values", 235 | required=True, 236 | default=None, 237 | ) 238 | @click.option( 239 | "-p", 240 | "--pattern", 241 | help="Pattern for the TSV files with raw iBAQ values", 242 | required=True, 243 | default="*ibaq.tsv", 244 | ) 245 | @click.option( 246 | "--comment", 247 | help="Comment character for the TSV files. 
Lines starting with this character will be ignored.", 248 | required=False, 249 | default="#", 250 | ) 251 | @click.option("--sep", help="Separator for the TSV files", required=False, default="\t") 252 | @click.option( 253 | "-o", 254 | "--output", 255 | help="Output file name for the combined iBAQ corrected values", 256 | required=True, 257 | ) 258 | @click.option( 259 | "-sid", 260 | "--sample_id_column", 261 | help="Sample ID column name", 262 | required=False, 263 | default=SAMPLE_ID, 264 | ) 265 | @click.option( 266 | "-pid", 267 | "--protein_id_column", 268 | help="Protein ID column name", 269 | required=False, 270 | default=PROTEIN_NAME, 271 | ) 272 | @click.option( 273 | "-ibaq", "--ibaq_raw_column", help="Name of the raw iBAQ column", required=False, default=IBAQ 274 | ) 275 | @click.option( 276 | "--ibaq_corrected_column", 277 | help="Name for the corrected iBAQ column", 278 | required=False, 279 | default=IBAQ_BEC, 280 | ) 281 | @click.option( 282 | "--export_anndata", 283 | help="Export the raw and corrected iBAQ values to an AnnData object", 284 | is_flag=True, 285 | ) 286 | @click.pass_context 287 | def correct_batches( 288 | ctx, 289 | folder: str, 290 | pattern: str, 291 | comment: str, 292 | sep: str, 293 | output: str, 294 | sample_id_column: str, 295 | protein_id_column: str, 296 | ibaq_raw_column: str, 297 | ibaq_corrected_column: str, 298 | export_anndata: bool, 299 | ): 300 | """ 301 | Correct batch effects in iBAQ data. 302 | 303 | This command processes TSV files containing raw iBAQ values, applies batch correction, 304 | and outputs the corrected values. It supports various options for specifying file patterns, 305 | column names, and output formats, including exporting to an AnnData file. 306 | """ 307 | run_batch_correction( 308 | folder=folder, 309 | pattern=pattern, 310 | comment=comment, 311 | sep=sep, 312 | output=output, 313 | sample_id_column=sample_id_column, 314 | protein_id_column=protein_id_column, 315 | ibaq_raw_column=ibaq_raw_column, 316 | ibaq_corrected_column=ibaq_corrected_column, 317 | export_anndata=export_anndata, 318 | ) 319 | -------------------------------------------------------------------------------- /ibaqpy/commands/features2peptides.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ibaqpy.ibaq.peptide_normalization import peptide_normalization 4 | from ibaqpy.model.normalization import FeatureNormalizationMethod, PeptideNormalizationMethod 5 | 6 | 7 | @click.command("features2peptides", short_help="Convert features to peptide intensities.") 8 | @click.option( 9 | "-p", 10 | "--parquet", 11 | help="Feature parquet file generated by quantms.io", 12 | required=True, 13 | type=click.Path(exists=True), 14 | ) 15 | @click.option( 16 | "-s", "--sdrf", help="SDRF file generated by quantms", default=None, type=click.Path() 17 | ) 18 | @click.option( 19 | "--min_aa", help="Minimum number of amino acids to filter peptides", type=int, default=7 20 | ) 21 | @click.option( 22 | "--min_unique", 23 | help="Minimum number of unique peptides to filter proteins", 24 | default=2, 25 | type=int, 26 | ) 27 | @click.option( 28 | "--remove_ids", 29 | help="Remove specific protein ids from the analysis using a file with one id per line", 30 | type=click.Path(exists=True), 31 | ) 32 | @click.option( 33 | "--remove_decoy_contaminants", 34 | help="Remove decoy and contaminant proteins from the analysis", 35 | is_flag=True, 36 | default=False, 37 | ) 38 | @click.option( 39 | 
"--remove_low_frequency_peptides", 40 | help="Remove peptides that are present in less than 20% of the samples", 41 | is_flag=True, 42 | default=False, 43 | ) 44 | @click.option( 45 | "-o", 46 | "--output", 47 | help="Peptide intensity file including other all properties for normalization", 48 | type=click.Path(), 49 | ) 50 | @click.option("--skip_normalization", help="Skip normalization step", is_flag=True, default=False) 51 | @click.option( 52 | "--nmethod", 53 | help="Normalization method used to normalize feature intensities for tec (options: mean, median, iqr, none)", 54 | default="median", 55 | type=click.Choice([f.name.lower() for f in FeatureNormalizationMethod], case_sensitive=False), 56 | ) 57 | @click.option( 58 | "--pnmethod", 59 | help="Normalization method used to normalize peptides intensities for all samples (options:globalMedian, conditionMedian)", 60 | default="globalMedian", 61 | type=click.Choice([p.name.lower() for p in PeptideNormalizationMethod], case_sensitive=False), 62 | ) 63 | @click.option( 64 | "--log2", 65 | help="Transform to log2 the peptide intensity values before normalization", 66 | is_flag=True, 67 | ) 68 | @click.option( 69 | "--save_parquet", 70 | help="Save normalized peptides to parquet", 71 | is_flag=True, 72 | ) 73 | @click.pass_context 74 | def features2parquet( 75 | ctx, 76 | parquet: str, 77 | sdrf: str, 78 | min_aa: int, 79 | min_unique: int, 80 | remove_ids: str, 81 | remove_decoy_contaminants: bool, 82 | remove_low_frequency_peptides: bool, 83 | output: str, 84 | skip_normalization: bool, 85 | nmethod: str, 86 | pnmethod: str, 87 | log2: bool, 88 | save_parquet: bool, 89 | ) -> None: 90 | """ 91 | Convert feature data to a parquet file with optional normalization and filtering steps. 92 | """ 93 | 94 | peptide_normalization( 95 | parquet=parquet, 96 | sdrf=sdrf, 97 | min_aa=min_aa, 98 | min_unique=min_unique, 99 | remove_ids=remove_ids, 100 | remove_decoy_contaminants=remove_decoy_contaminants, 101 | remove_low_frequency_peptides=remove_low_frequency_peptides, 102 | output=output, 103 | skip_normalization=skip_normalization, 104 | nmethod=nmethod, 105 | pnmethod=pnmethod, 106 | log2=log2, 107 | save_parquet=save_parquet, 108 | ) 109 | -------------------------------------------------------------------------------- /ibaqpy/commands/peptides2protein.py: -------------------------------------------------------------------------------- 1 | import click 2 | from ibaqpy.ibaq.peptides2protein import peptides_to_protein 3 | from ibaqpy.model.organism_metadata import OrganismDescription 4 | 5 | 6 | @click.command("peptides2protein", short_help="Compute IBAQ values for proteins") 7 | @click.option( 8 | "-f", 9 | "--fasta", 10 | help="Protein database to compute IBAQ values", 11 | required=True, 12 | type=click.Path(exists=True), 13 | ) 14 | @click.option( 15 | "-p", 16 | "--peptides", 17 | help="Peptide identifications with intensities following the peptide intensity output", 18 | required=True, 19 | type=click.Path(exists=True), 20 | ) 21 | @click.option( 22 | "-e", 23 | "--enzyme", 24 | help="Enzyme used during the analysis of the dataset (default: Trypsin)", 25 | default="Trypsin", 26 | ) 27 | @click.option( 28 | "-n", 29 | "--normalize", 30 | help="Normalize IBAQ values using by using the total IBAQ of the experiment", 31 | is_flag=True, 32 | ) 33 | @click.option("--min_aa", help="Minimum number of amino acids to consider a peptide", default=7) 34 | @click.option("--max_aa", help="Maximum number of amino acids to consider a peptide", default=30) 35 
| @click.option("-t", "--tpa", help="Whether calculate TPA", is_flag=True) 36 | @click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True) 37 | @click.option("-i", "--ploidy", help="Ploidy number (default: 2)", default=2) 38 | @click.option( 39 | "-m", 40 | "--organism", 41 | help="Organism source of the data (default: human)", 42 | type=click.Choice( 43 | sorted(map(str.lower, OrganismDescription.registered_organisms())), case_sensitive=False 44 | ), 45 | default="human", 46 | ) 47 | @click.option( 48 | "-c", "--cpc", help="Cellular protein concentration(g/L) (default: 200)", default=200 49 | ) 50 | @click.option("-o", "--output", help="Output file with the proteins and ibaq values") 51 | @click.option( 52 | "--verbose", 53 | help="Print addition information about the distributions of the intensities, number of peptides remove " 54 | "after normalization, etc.", 55 | is_flag=True, 56 | ) 57 | @click.option( 58 | "--qc_report", 59 | help="PDF file to store multiple QC images", 60 | default="QCprofile.pdf", 61 | ) 62 | @click.pass_context 63 | def peptides2protein( 64 | click_context, 65 | fasta: str, 66 | peptides: str, 67 | enzyme: str, 68 | normalize: bool, 69 | min_aa: int, 70 | max_aa: int, 71 | tpa: bool, 72 | ruler: bool, 73 | organism: str, 74 | ploidy: int, 75 | cpc: float, 76 | output: str, 77 | verbose: bool, 78 | qc_report: str, 79 | ) -> None: 80 | """ 81 | Compute IBAQ values for proteins from peptide intensity data. 82 | 83 | This command processes peptide identifications and computes IBAQ values, 84 | optionally normalizing the data and calculating protein metrics using a 85 | proteomic ruler approach. It supports generating a QC report with distribution 86 | plots if verbose mode is enabled. 87 | """ 88 | peptides_to_protein( 89 | fasta=fasta, 90 | peptides=peptides, 91 | enzyme=enzyme, 92 | normalize=normalize, 93 | min_aa=min_aa, 94 | max_aa=max_aa, 95 | tpa=tpa, 96 | ruler=ruler, 97 | ploidy=ploidy, 98 | cpc=cpc, 99 | organism=organism, 100 | output=output, 101 | verbose=verbose, 102 | qc_report=qc_report, 103 | ) 104 | -------------------------------------------------------------------------------- /ibaqpy/commands/tsne_visualization.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | 4 | import click 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | from sklearn.decomposition import PCA 10 | from sklearn.manifold import TSNE 11 | 12 | from ibaqpy.ibaq.ibaqpy_commons import PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | logger.addHandler(logging.NullHandler()) 17 | 18 | 19 | # function to compute principal components 20 | def compute_pca(df, n_components=5) -> pd.DataFrame: 21 | """ 22 | Compute principal components for a given dataframe. 23 | 24 | Parameters: 25 | df : pd.DataFrame 26 | Input dataframe with samples as rows and features as columns. 27 | n_components : int, optional 28 | The number of principal components to compute (default is 5). 29 | Returns: 30 | df_pca : pd.DataFrame 31 | A dataframe with the principal components. 
32 | """ 33 | 34 | pca = PCA(n_components=n_components) 35 | pca.fit(df) 36 | df_pca = pca.transform(df) 37 | 38 | df_pca = pd.DataFrame( 39 | df_pca, index=df.index, columns=[f"PC{i}" for i in range(1, n_components + 1)] 40 | ) 41 | 42 | plt.rcParams["figure.figsize"] = (12, 6) 43 | 44 | fig, ax = plt.subplots() 45 | xi = np.arange(1, n_components + 1, step=1) 46 | y = np.cumsum(pca.explained_variance_ratio_) 47 | 48 | plt.ylim(0.0, 1.1) 49 | plt.plot(xi, y, marker="o", linestyle="--", color="b") 50 | 51 | plt.xlabel("Number of Components") 52 | plt.xticks( 53 | np.arange(0, n_components, step=1) 54 | ) # change from 0-based array index to 1-based human-readable label 55 | plt.ylabel("Cumulative variance (%)") 56 | plt.title("The number of components needed to explain variance") 57 | 58 | plt.axhline(y=0.95, color="r", linestyle="-") 59 | plt.text(0.5, 0.85, "95% cut-off threshold", color="red", fontsize=16) 60 | 61 | ax.grid(axis="x") 62 | plt.show() 63 | 64 | return df_pca 65 | 66 | 67 | def compute_tsne(df_pca, n_components=2, perplexity=30, learning_rate=200, n_iter=2000): 68 | """ 69 | Compute t-SNE components from PCA components. 70 | 71 | This function applies t-SNE (t-Distributed Stochastic Neighbor Embedding) to the input DataFrame, 72 | which is expected to contain PCA components with samples as rows. The output is another DataFrame 73 | that contains t-SNE components, also with samples as rows. 74 | 75 | Parameters: 76 | df_pca : pandas DataFrame 77 | Input DataFrame containing PCA components. Rows are samples and columns are PCA components. 78 | n_components : int, optional 79 | The number of dimensions for the t-SNE components (default is 2). 80 | perplexity : float, optional 81 | The perplexity parameter for t-SNE, which can influence the balance between maintaining 82 | the local and global structure of the data (default is 30). 83 | learning_rate : float, optional 84 | The learning rate for t-SNE (default is 200). 85 | n_iter : int, optional 86 | The number of iterations for t-SNE optimization (default is 2000). 87 | 88 | Returns: 89 | df_tsne : pandas DataFrame 90 | Output DataFrame containing t-SNE components. Rows are samples and columns are t-SNE components. 91 | 92 | Example 93 | ------- 94 | df_pca = pd.DataFrame(data, columns=['PC1', 'PC2', 'PC3']) 95 | df_tsne = compute_tsne(df_pca) 96 | """ 97 | 98 | tsne = TSNE( 99 | n_components=n_components, 100 | perplexity=perplexity, 101 | learning_rate=learning_rate, 102 | n_iter=n_iter, 103 | ) 104 | tsne_results = tsne.fit_transform(np.asarray(df_pca)) 105 | 106 | tsne_cols = [f"tSNE{i + 1}" for i in range(n_components)] 107 | 108 | df_tsne = pd.DataFrame(data=tsne_results, columns=tsne_cols) 109 | df_tsne.index = df_pca.index 110 | return df_tsne 111 | 112 | 113 | def plot_tsne(df, x_col, y_col, hue_col, file_name): 114 | """ 115 | Generate and save a t-SNE scatter plot from a DataFrame. 116 | 117 | This function creates a scatter plot using seaborn's scatterplot function, 118 | with the specified columns for the x-axis, y-axis, and hue. The plot is 119 | customized with labels, a title, and a legend positioned inside the plot. 120 | The resulting plot is saved to the specified file. 121 | 122 | Parameters: 123 | df (pd.DataFrame): The DataFrame containing the data to plot. 124 | x_col (str): The column name for the x-axis values. 125 | y_col (str): The column name for the y-axis values. 126 | hue_col (str): The column name for the hue (color) values. 
127 | file_name (str): The file path where the plot image will be saved. 128 | """ 129 | fig, ax = plt.subplots(1, 1, figsize=(20, 10)) 130 | sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax, markers=["o", "+", "x"]) 131 | ax.set_xlabel(x_col) 132 | ax.set_ylabel(y_col) 133 | ax.set_title(f"{x_col} vs {y_col} with {hue_col} information") 134 | # set legend inside the plot left an upper corner 135 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=8) 136 | plt.subplots_adjust(right=0.8) 137 | plt.savefig(file_name) 138 | 139 | 140 | @click.command() 141 | @click.option("-f", "--folder", help="Folder that contains all the protein files", required=True) 142 | @click.option( 143 | "-o", 144 | "--pattern", 145 | help="Protein file pattern", 146 | # TODO: I think we should use instead of pattern the structure of quantms.io for absolute quantification 147 | required=False, 148 | default="proteins.tsv", 149 | ) 150 | def tsne_visualization(folder: str, pattern: str): 151 | """ 152 | Generate a t-SNE visualization for protein data from specified files. 153 | 154 | This command-line tool reads protein data files from a specified folder, 155 | applies PCA and t-SNE for dimensionality reduction, and generates a scatter 156 | plot of the t-SNE components. The plot is saved as a PDF file. 157 | 158 | Parameters: 159 | folder (str): The folder containing protein data files. 160 | pattern (str): The file pattern to match protein files. Defaults to 'proteins.tsv'. 161 | """ 162 | # get all the files in the folder 163 | files = glob.glob(f"{folder}/*{pattern}") 164 | 165 | # get the files into pandas selected columns 166 | # (Proteins accession, Sample ID, Reanalysis accession, Intensity) 167 | 168 | dfs = [] # list of dataframes 169 | 170 | for f in files: 171 | reanalysis = (f.split("/")[-1].split("_")[0]).replace("-proteins.tsv", "") 172 | dfs += [ 173 | pd.read_csv(f, usecols=[PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG], sep=",").assign( 174 | reanalysis=reanalysis 175 | ) 176 | ] 177 | 178 | total_proteins = pd.concat(dfs, ignore_index=True) 179 | 180 | normalize_df = pd.pivot_table( 181 | total_proteins, 182 | index=[SAMPLE_ID, "reanalysis"], 183 | columns=PROTEIN_NAME, 184 | values=IBAQ_LOG, 185 | ) 186 | normalize_df = normalize_df.fillna(0) 187 | df_pca = compute_pca(normalize_df, n_components=30) 188 | df_tsne = compute_tsne(df_pca) 189 | 190 | batch = df_tsne.index.get_level_values("reanalysis").tolist() 191 | df_tsne["batch"] = batch 192 | 193 | # plot the t-SNE components tSNE1 vs tSNE2 with batch information using seaborn 194 | plot_tsne(df_tsne, "tSNE1", "tSNE2", "batch", "5.tsne_plot_with_batch_information.pdf") 195 | 196 | logger.info(total_proteins.shape) 197 | 198 | 199 | if __name__ == "__main__": 200 | tsne_visualization() 201 | -------------------------------------------------------------------------------- /ibaqpy/data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ibaqpy/data/data.py: -------------------------------------------------------------------------------- 1 | histones = { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | "Q5SQT3", 23 | 
"Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | "Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7", 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN", 196 | ], 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | "Q9CQ70", 219 | 
"Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | "A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7", 296 | ], 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040", 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | "B9EI85_MOUSE", 410 | 
"Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE", 418 | ], 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489", 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 | "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL", 478 | ], 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309", 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST", 502 | ], 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": ["P48003", "P04909", "P04910", "P04913", "P09988", "P10651", "P09322"], 508 | "histone_entries": [ 509 | "H2AZ_SCHPO", 510 | "H2A1_SCHPO", 511 | "H2A2_SCHPO", 512 | "H2B1_SCHPO", 513 | "H31_SCHPO", 514 | "H33_SCHPO", 515 | "H4_SCHPO", 516 | ], 517 | }, 518 | } 519 | -------------------------------------------------------------------------------- /ibaqpy/data/organisms.json: -------------------------------------------------------------------------------- 1 | { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | "Q5SQT3", 23 | "Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | 
"Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7" 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN" 196 | ] 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | "Q9CQ70", 219 | "Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | 
"A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7" 296 | ] 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040" 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | "B9EI85_MOUSE", 410 | "Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE" 418 | ] 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489" 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 
| "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL" 478 | ] 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309" 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST" 502 | ] 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": [ 508 | "P48003", 509 | "P04909", 510 | "P04910", 511 | "P04913", 512 | "P09988", 513 | "P10651", 514 | "P09322" 515 | ], 516 | "histone_entries": [ 517 | "H2AZ_SCHPO", 518 | "H2A1_SCHPO", 519 | "H2A2_SCHPO", 520 | "H2B1_SCHPO", 521 | "H31_SCHPO", 522 | "H33_SCHPO", 523 | "H4_SCHPO" 524 | ] 525 | } 526 | } 527 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/ibaq/__init__.py -------------------------------------------------------------------------------- /ibaqpy/ibaq/combiner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from ibaqpy.ibaq.ibaqpy_commons import load_feature, load_sdrf 9 | from ibaqpy.ibaq.imputation_methods import impute_missing_values 10 | from ibaqpy.ibaq.utils import ( 11 | compute_pca, 12 | get_batch_info_from_sample_names, 13 | generate_meta, 14 | folder_retrieval, 15 | filter_missing_value_by_group, 16 | split_df_by_column, 17 | fill_samples, 18 | iterative_outlier_removal, 19 | plot_pca, 20 | remove_single_sample_batches, 21 | apply_batch_correction, 22 | ) 23 | 24 | logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class Combiner: 29 | def __init__(self, data_folder: os.PathLike, covariate: str = None, organism: str = "HUMAN"): 30 | """ 31 | Initialize a Combiner instance to process and combine SDRF and iBAQ data. 32 | 33 | Parameters: 34 | data_folder : os.PathLike 35 | Path to the folder containing SDRF and iBAQ files. 36 | covariate : str, optional 37 | Covariate to be used in data processing, default is None. 38 | organism : str, optional 39 | Organism filter for protein names, default is "HUMAN". 40 | 41 | Raises: 42 | FileNotFoundError 43 | If the specified data folder does not exist or is not a directory. 44 | 45 | Notes 46 | ----- 47 | This method initializes various attributes for data processing, retrieves 48 | SDRF and iBAQ files from the specified folder, and filters protein data 49 | by the specified organism. 
50 | """ 51 | self.df_pca = compute_pca(self.df_corrected.T, n_components=5) 52 | self.df_corrected = None 53 | self.batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) 54 | self.df_pca = None 55 | self.df_filtered_outliers = None 56 | self.batch_index = get_batch_info_from_sample_names(self.df.columns) 57 | self.samples_number = None 58 | self.datasets = None 59 | self.samples = self.df.columns.tolist() 60 | self.proteins = self.df["ProteinName"].unique().tolist() 61 | logger.info("Combining SDRFs and ibaq results ...") 62 | self.data_folder = Path(data_folder) 63 | if not self.data_folder.exists() or not self.data_folder.is_dir(): 64 | raise FileNotFoundError(f"Data folder {self.data_folder} does not exsit!") 65 | self.covariate = covariate 66 | files = folder_retrieval(str(self.data_folder)) 67 | self.metadata, self.df = pd.DataFrame(), pd.DataFrame() 68 | for sdrf in files["sdrf"]: 69 | sdrf_df = load_sdrf(sdrf) 70 | self.metadata = pd.concat([self.metadata, generate_meta(sdrf_df)]) 71 | self.metadata = self.metadata.drop_duplicates() 72 | self.metadata.index = self.metadata["sample_id"] 73 | 74 | for ibaq in files["ibaq"]: 75 | self.df = pd.concat([self.df, load_feature(ibaq)]) 76 | self.df = self.df[self.df["ProteinName"].str.endswith(organism)] 77 | self.df.index = self.df["SampleID"] 78 | self.df = self.df.join(self.metadata, how="left") 79 | logger.info(self.metadata, self.df.head) 80 | 81 | def read_data(self, meta: str, ibaq: str, organism="HUMAN", covariate=None): 82 | """ 83 | Reads and processes iBAQ and metadata files, filtering protein data by organism. 84 | 85 | Parameters: 86 | meta : str 87 | Path to the metadata CSV file. 88 | ibaq : str 89 | Path to the iBAQ CSV file. 90 | organism : str, optional 91 | Organism filter for protein names, default is "HUMAN". 92 | covariate : str, optional 93 | Covariate to be used in data processing, default is None. 94 | 95 | Notes 96 | ----- 97 | The method updates the instance's dataframe and metadata attributes by 98 | reading the specified files, filtering proteins by the given organism, 99 | and joining metadata to the iBAQ data. 100 | """ 101 | 102 | self.covariate = covariate 103 | self.df = pd.read_csv(ibaq, index_col=0) 104 | self.metadata = pd.read_csv(meta) 105 | self.df = self.df[self.df["ProteinName"].str.endswith(organism)] 106 | self.df.index = self.df["SampleID"] 107 | self.metadata = self.metadata.drop_duplicates() 108 | self.df = self.df.join(self.metadata, how="left") 109 | 110 | def imputer(self, covariate_to_keep: list = None): 111 | """ 112 | Impute missing values in the combined iBAQ results DataFrame. 113 | 114 | This method processes the DataFrame by filtering, filling, and imputing 115 | missing values based on specified covariates. It ensures that only columns 116 | with a sufficient percentage of non-missing values are retained and performs 117 | imputation using KNN or other specified methods. 118 | 119 | Parameters: 120 | covariate_to_keep : list, optional 121 | A list of covariate values to retain in the DataFrame. Only rows with 122 | these covariate values will be kept. 123 | 124 | Raises: 125 | SystemExit 126 | If the specified covariate contains fewer than two unique values. 127 | 128 | Notes 129 | ----- 130 | - The method modifies the instance's DataFrame by imputing missing values 131 | and potentially altering its structure. 132 | - The imputation process requires samples as columns and proteins as rows. 
133 | """ 134 | logger.info("Imputing merged ibaq results ...") 135 | # Keep only columns 'sample_id' and covariate from df_metadata 136 | if self.covariate: 137 | if len(self.metadata[self.covariate].unique()) < 2: 138 | raise SystemExit( 139 | f"{self.covariate} should contain at least two different covariates!" 140 | ) 141 | 142 | # Keep only rows within covariate_to_keep, you can keep tissue or tissue part you want. 143 | if covariate_to_keep: 144 | self.df = self.df[self.df[self.covariate].isin(covariate_to_keep)] 145 | 146 | # keep columns with at least 30% of non-missing values in each covariate_index group 147 | self.df = filter_missing_value_by_group( 148 | self.df, col="ProteinName", non_missing_percent_to_keep=0.3 149 | ) 150 | 151 | # TODO: Data for imputation should take samples as columns, proteins as rows. [Expression Matrix] 152 | # Also need to fill the proteins didn't show in original results for each sample. 153 | if self.covariate: 154 | # split df by covariates 155 | df_list = split_df_by_column(self.df, cov_index_col=self.covariate) 156 | df_list = [fill_samples(df, self.proteins) for df in df_list] 157 | 158 | # impute missing values with KNNImputer for every df in df_list 159 | df_list = impute_missing_values(df_list) 160 | 161 | # concatenate all dataframes in df_list into one dataframe 162 | self.df = pd.concat(df_list, axis=1) 163 | else: 164 | self.df = fill_samples(self.df, self.proteins) 165 | self.df = impute_missing_values(self.df) 166 | 167 | self.datasets = list(set([sample.split("-")[0] for sample in self.samples])) 168 | logger.info(f"DataFrame head after imputation:\n{self.df.head()}") 169 | 170 | def outlier_removal( 171 | self, 172 | n_components: int = None, 173 | min_cluster_size: int = None, 174 | min_samples_num: int = None, 175 | n_iter: int = None, 176 | ): 177 | """ 178 | Remove outliers from the imputed data using an iterative approach and plot the PCA results. 179 | 180 | This method applies iterative outlier removal on the imputed data, updates the filtered 181 | DataFrame, and generates a PCA plot of the corrected data with outliers removed. 182 | 183 | Parameters: 184 | n_components : int, optional 185 | Number of principal components to compute. Defaults to a third of the unique batch indices. 186 | min_cluster_size : int, optional 187 | Minimum size of clusters for outlier detection. Defaults to the median number of samples per batch. 188 | min_samples_num : int, optional 189 | Minimum number of samples in a neighborhood for a point to be considered a core point. 190 | Defaults to the median number of samples per batch. 191 | n_iter : int, optional 192 | Number of iterations for outlier removal. Defaults to 5. 193 | 194 | Notes 195 | ----- 196 | - The method modifies the instance's DataFrame by removing outliers. 197 | - A PCA plot is saved as 'pca_corrected_outliers_removed.png'. 
198 | """ 199 | logger.info("Removing outliers from imputed data ...") 200 | # Apply iterative outlier removal on imputed data 201 | # get batch indices from the columns names 202 | batches = [sample.split("-")[0] for sample in self.samples] 203 | self.samples_number = {dataset: batches.count(dataset) for dataset in self.datasets} 204 | min_samples = round(np.median(list(self.samples_number.values()))) 205 | if min_samples == 1: 206 | min_samples = 2 207 | # apply iterative outlier removal 208 | self.df_filtered_outliers = iterative_outlier_removal( 209 | self.df, 210 | self.batch_index, 211 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 212 | min_cluster_size=min_cluster_size if min_cluster_size else min_samples, 213 | min_samples=min_samples_num if min_samples_num else min_samples, 214 | n_iter=n_iter if n_iter else 5, 215 | ) 216 | logger.info(self.df_filtered_outliers) 217 | # plot PCA of corrected data with outliers removed 218 | # transpose the dataframe to get samples as rows and features as columns 219 | self.df_pca = compute_pca( 220 | self.df_filtered_outliers.T, 221 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 222 | ) 223 | 224 | # add batch information to the dataframe 225 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 226 | 227 | # plot PC1 vs PC2 with batch information using seaborn 228 | # put the legend outside the plot 229 | # save the plot as a png file 230 | plot_pca( 231 | self.df_pca, 232 | title="PCA plot of corrected data with outliers removed", 233 | output_file="pca_corrected_outliers_removed.png", 234 | ) 235 | 236 | def batch_correction(self, n_components: int = None, tissue_parts_to_keep: int = None): 237 | """ 238 | Apply batch effect correction to the data and plot PCA results. 239 | 240 | This method performs batch correction on the data using specified covariates 241 | and plots PCA before and after correction. It filters out batches with only 242 | one sample and optionally retains specific tissue parts. 243 | 244 | Parameters: 245 | n_components : int, optional 246 | Number of principal components to compute. Defaults to a third of the unique batch indices. 247 | tissue_parts_to_keep : int, optional 248 | Number of tissue parts to retain in the data. 249 | 250 | Notes 251 | ----- 252 | - The method modifies the instance's DataFrame by applying batch correction. 253 | - PCA plots are saved as 'pca_uncorrected.png' and 'pca_corrected.png'. 
254 | """ 255 | logger.info("Applying batch effect correction ...") 256 | # Plot PCA of uncorrected imputed data 257 | # transpose the dataframe to get samples as rows and features as columns 258 | self.df_pca = compute_pca( 259 | self.df.T, 260 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 261 | ) 262 | 263 | # add batch information to the dataframe 264 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 265 | 266 | # plot PC1 vs PC2 with batch information using seaborn 267 | # put the legend outside the plot 268 | # save the plot as a png file 269 | plot_pca( 270 | self.df_pca, 271 | title="PCA plot of uncorrected data", 272 | output_file="pca_uncorrected.png", 273 | ) 274 | 275 | # keep samples only in tissue_part from metadata 276 | # TODO: specify covariates 277 | if tissue_parts_to_keep: 278 | self.metadata = self.metadata[self.metadata["tissue_part"].isin(tissue_parts_to_keep)] 279 | samples_to_keep = self.metadata["sample_id"].tolist() 280 | 281 | # keep samples in df that are also in samples_to_keep 282 | self.df = self.df[[s for s in self.df.columns if s in samples_to_keep]] 283 | 284 | # 2. Apply batch correction with covariate information 285 | # Before apply batch correction, filter out batches with just one sample (otherwise the batch correction will fail). 286 | batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) 287 | self.df = remove_single_sample_batches(self.df, batch_index) 288 | 289 | # get covariate information from metadata. 290 | columns = self.df.columns.tolist() 291 | self.metadata = self.metadata[self.metadata["sample_id"].isin(columns)] 292 | # reorder metadata to match the order of columns in df 293 | self.metadata = self.metadata.reset_index(drop=True) 294 | self.metadata = self.metadata.set_index("sample_id").reindex(columns, axis=0).reset_index() 295 | if self.covariate: 296 | # get the covariates from metadata as a list 297 | covariates_index = self.metadata[self.covariate].tolist() 298 | else: 299 | covariates_index = [] 300 | 301 | # apply batch correction 302 | self.df_corrected = apply_batch_correction( 303 | self.df, self.batch_index, covs=covariates_index 304 | ) 305 | logger.info(self.df_corrected) 306 | 307 | # plot PCA of corrected data 308 | # transpose the dataframe to get samples as rows and features as columns 309 | # add batch information to the dataframe 310 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 311 | 312 | # plot PC1 vs PC2 with batch information using seaborn 313 | # put the legend outside the plot 314 | # save the plot as a png file 315 | plot_pca( 316 | self.df_pca, 317 | title="PCA plot of corrected data", 318 | output_file="pca_corrected.png", 319 | ) 320 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/file_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import warnings 4 | from typing import List, Optional, TYPE_CHECKING 5 | 6 | import pandas as pd 7 | 8 | from ibaqpy.ibaq.ibaqpy_postprocessing import pivot_wider 9 | 10 | 11 | if TYPE_CHECKING: 12 | import anndata as an 13 | 14 | logger = logging.getLogger(__name__) 15 | logger.addHandler(logging.NullHandler()) 16 | 17 | 18 | def create_anndata( 19 | df: pd.DataFrame, 20 | obs_col: str, 21 | var_col: str, 22 | value_col: str, 23 | layer_cols: Optional[List[str]] = None, 24 | obs_metadata_cols: Optional[List[str]] = None, 25 | var_metadata_cols: 
Optional[List[str]] = None, 26 | ) -> "an.AnnData": 27 | """ 28 | Create an AnnData object from a long-format DataFrame. 29 | 30 | Parameters: 31 | df (pd.DataFrame): Input data in long format. 32 | obs_col (str): Column name in df representing observation IDs. 33 | var_col (str): Column name in df representing variable IDs. 34 | value_col (str): Column name in df representing the main data values. 35 | layer_cols (Optional[List[str]]): List of column names in df to add as additional layers. 36 | obs_metadata_cols (Optional[List[str]]): List of column names in df to add as observation metadata. 37 | var_metadata_cols (Optional[List[str]]): List of column names in df to add as variable metadata. 38 | 39 | Returns: 40 | anndata.AnnData: The constructed AnnData object. 41 | """ 42 | 43 | import anndata as an 44 | 45 | if df.empty: 46 | raise ValueError("Cannot create AnnData object from empty DataFrame") 47 | # Validate that the required columns exist in the DataFrame. 48 | required_cols = [obs_col, var_col, value_col] 49 | missing = [col for col in required_cols if col not in df.columns] 50 | if missing: 51 | raise ValueError( 52 | f"The following required columns are missing from the input DataFrame: {missing}" 53 | ) 54 | 55 | # Pivot the long dataframe to create a wide-format matrix for the main values. 56 | df_matrix = pivot_wider(df, row_name=obs_col, col_name=var_col, values=value_col, fillna=True) 57 | if df_matrix.empty: 58 | raise ValueError("Pivot operation resulted in an empty DataFrame") 59 | if df_matrix.shape[0] == 0 or df_matrix.shape[1] == 0: 60 | raise ValueError("Pivot operation resulted in a DataFrame with zero dimensions") 61 | 62 | # Create the AnnData object with the main data matrix. 63 | adata = an.AnnData( 64 | X=df_matrix.to_numpy(), 65 | obs=df_matrix.index.to_frame(), 66 | var=df_matrix.columns.to_frame(), 67 | ) 68 | 69 | def add_metadata(metadata_df: pd.DataFrame, key: str, cols: List[str]) -> pd.DataFrame: 70 | """ 71 | Add metadata columns to a DataFrame by mapping values from the original long dataframe. 72 | 73 | Parameters: 74 | metadata_df (pd.DataFrame): DataFrame (either adata.obs or adata.var) to update. 75 | key (str): The column name used as key (obs_col for observations, var_col for variables). 76 | cols (List[str]): List of metadata columns to add. 77 | 78 | Returns: 79 | pd.DataFrame: The updated metadata DataFrame. 80 | """ 81 | for col in cols: 82 | if col not in df.columns: 83 | warnings.warn( 84 | f"Column '{col}' not found in the input DataFrame. Skipping metadata for '{col}'." 85 | ) 86 | continue 87 | # Create a mapping from key to metadata values. 88 | mapping = df[[key, col]].drop_duplicates().set_index(key)[col] 89 | metadata_df[col] = metadata_df.index.map(mapping) 90 | return metadata_df 91 | 92 | # Add observation metadata, if provided. 93 | if obs_metadata_cols: 94 | adata.obs = add_metadata(adata.obs, obs_col, obs_metadata_cols) 95 | 96 | # Add variable metadata, if provided. 97 | if var_metadata_cols: 98 | adata.var = add_metadata(adata.var, var_col, var_metadata_cols) 99 | 100 | # Add additional layers (if any) using a similar pivot operation. 101 | if layer_cols: 102 | for layer_col in layer_cols: 103 | if layer_col not in df.columns: 104 | warnings.warn( 105 | f"Layer column '{layer_col}' not found in the input DataFrame. Skipping layer '{layer_col}'." 
106 | )
107 | continue
108 | df_layer = pivot_wider(
109 | df, row_name=obs_col, col_name=var_col, values=layer_col, fillna=True
110 | )
111 | adata.layers[layer_col] = df_layer.to_numpy()
112 | 
113 | logger.info(f"Created AnnData object:\n {adata}")
114 | 
115 | return adata
116 | 
117 | 
118 | def combine_ibaq_tsv_files(
119 | dir_path: str, pattern: str = "*", comment: str = "#", sep: str = "\t"
120 | ) -> pd.DataFrame:
121 | """
122 | Combine multiple TSV files from a directory into a single pandas DataFrame.
123 | 
124 | Parameters:
125 | dir_path : str
126 | Directory path containing the TSV files.
127 | pattern : str, optional
128 | Pattern to match files in the directory (default is '*').
129 | comment : str, optional
130 | Character to indicate the start of a comment line (default is '#').
131 | It will skip lines starting with this character when reading the TSV files.
132 | sep : str, optional
133 | Delimiter to use for reading the TSV files (default is '\t').
134 | 
135 | Returns:
136 | pd.DataFrame
137 | Combined DataFrame containing data from all TSV files. A FileNotFoundError is raised if no files match the pattern, and a ValueError if a file cannot be read or its columns differ from those of the first file.
138 | 
139 | Examples
140 | --------
141 | dir_path = './ibaqpy-research-data/ibaq-hela-raw'
142 | combined_df = combine_ibaq_tsv_files(dir_path, pattern='*ibaq.tsv', comment='#', sep='\t')
143 | """
144 | file_paths = glob.glob(f"{dir_path}/{pattern}")
145 | 
146 | if not file_paths:
147 | raise FileNotFoundError(
148 | f"No files found in the directory '{dir_path}' matching the pattern '{pattern}'."
149 | )
150 | 
151 | dataframes = []
152 | 
153 | first_schema = None
154 | for file_path in file_paths:
155 | try:
156 | # Read the TSV file, skipping lines that start with the comment character
157 | df = pd.read_csv(file_path, sep=sep, comment=comment)
158 | 
159 | # Validate schema consistency
160 | if first_schema is None:
161 | first_schema = set(df.columns)
162 | elif set(df.columns) != first_schema:
163 | raise ValueError(
164 | f"Schema mismatch in file '{file_path}'. 
" 165 | f"Expected columns: {sorted(first_schema)}, " 166 | f"got: {sorted(df.columns)}" 167 | ) 168 | 169 | dataframes.append(df) 170 | except Exception as e: 171 | raise ValueError(f"Error reading file '{file_path}': {str(e)}") 172 | 173 | # Concatenate all DataFrames 174 | combined_df = pd.concat(dataframes, ignore_index=True) 175 | 176 | return combined_df 177 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/ibaqpy_commons.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | from matplotlib import pyplot as plt 8 | from matplotlib.figure import Figure 9 | 10 | 11 | PROTEIN_NAME = "ProteinName" 12 | PEPTIDE_SEQUENCE = "PeptideSequence" 13 | PEPTIDE_CANONICAL = "PeptideCanonical" 14 | PEPTIDE_CHARGE = "PrecursorCharge" 15 | CHANNEL = "Channel" 16 | MIXTRUE = "Mixture" 17 | TECHREPMIXTURE = "TechRepMixture" 18 | CONDITION = "Condition" 19 | BIOREPLICATE = "BioReplicate" 20 | TECHREPLICATE = "TechReplicate" 21 | RUN = "Run" 22 | FRACTION = "Fraction" 23 | INTENSITY = "Intensity" 24 | NORM_INTENSITY = "NormIntensity" 25 | REFERENCE = "Reference" 26 | SAMPLE_ID = "SampleID" 27 | SAMPLE_ID_REGEX = r"^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$" 28 | SEARCH_ENGINE = "searchScore" 29 | SCAN = "Scan" 30 | MBR = "MatchBetweenRuns" 31 | IBAQ = "Ibaq" 32 | IBAQ_NORMALIZED = "IbaqNorm" 33 | IBAQ_LOG = "IbaqLog" 34 | IBAQ_BEC = "IbaqBec" 35 | IBAQ_PPB = "IbaqPpb" 36 | TPA = "TPA" 37 | MOLECULARWEIGHT = "MolecularWeight" 38 | COPYNUMBER = "CopyNumber" 39 | CONCENTRATION_NM = "Concentration[nM]" 40 | WEIGHT_NG = "Weight[ng]" 41 | MOLES_NMOL = "Moles[nmol]" 42 | GLOBALMEDIAN = "globalMedian" 43 | CONDITIONMEDIAN = "conditionMedian" 44 | 45 | 46 | PARQUET_COLUMNS = [ 47 | "pg_accessions", 48 | "peptidoform", 49 | "sequence", 50 | "precursor_charge", 51 | "channel", 52 | "condition", 53 | "biological_replicate", 54 | "run", 55 | "fraction", 56 | "intensity", 57 | "reference_file_name", 58 | "sample_accession", 59 | ] 60 | 61 | 62 | parquet_map = { 63 | "pg_accessions": PROTEIN_NAME, 64 | "peptidoform": PEPTIDE_SEQUENCE, 65 | "sequence": PEPTIDE_CANONICAL, 66 | "precursor_charge": PEPTIDE_CHARGE, 67 | "channel": CHANNEL, 68 | "condition": CONDITION, 69 | "biological_replicate": BIOREPLICATE, 70 | "run": RUN, 71 | "fraction": FRACTION, 72 | "intensity": INTENSITY, 73 | "reference_file_name": REFERENCE, 74 | "sample_accession": SAMPLE_ID, 75 | } 76 | 77 | 78 | def get_accession(identifier: str) -> str: 79 | """ 80 | Get protein accession from the identifier (e.g. 
sp|P12345|PROT_NAME) 81 | :param identifier: Protein identifier 82 | :return: Protein accession 83 | """ 84 | identifier_lst = identifier.split("|") 85 | if len(identifier_lst) == 1: 86 | return identifier_lst[0] 87 | else: 88 | return identifier_lst[1] 89 | 90 | 91 | def plot_distributions( 92 | dataset: pd.DataFrame, 93 | field: str, 94 | class_field: str, 95 | title: str = "", 96 | log2: bool = True, 97 | width: float = 10, 98 | ) -> Figure: 99 | """ 100 | Print the quantile plot for the dataset 101 | :param dataset: DataFrame 102 | :param field: Field that would be use in the dataframe to plot the quantile 103 | :param class_field: Field to group the quantile into classes 104 | :param title: Title of the box plot 105 | :param log2: Log the intensity values 106 | :param width: size of the plot 107 | :return: 108 | """ 109 | pd.set_option("mode.chained_assignment", None) 110 | normalize = dataset[[field, class_field]].reset_index(drop=True) 111 | if log2: 112 | normalize[field] = np.log2(normalize[field]) 113 | normalize.dropna(subset=[field], inplace=True) 114 | plt.figure(dpi=500, figsize=(width, 8)) 115 | fig = sns.kdeplot(data=normalize, x=field, hue=class_field, palette="Paired", linewidth=2) 116 | sns.despine(ax=fig, top=True, right=True) 117 | plt.title(title) 118 | pd.set_option("mode.chained_assignment", "warn") 119 | 120 | return plt.gcf() 121 | 122 | 123 | def plot_box_plot( 124 | dataset: pd.DataFrame, 125 | field: str, 126 | class_field: str, 127 | log2: bool = False, 128 | width: float = 10, 129 | rotation: int = 30, 130 | title: str = "", 131 | violin: bool = False, 132 | ) -> Figure: 133 | """ 134 | Plot a box plot of two values field and classes field 135 | :param violin: Also add violin on top of box plot 136 | :param dataset: Dataframe with peptide intensities 137 | :param field: Intensity field 138 | :param class_field: class to group the peptides 139 | :param log2: transform peptide intensities to log scale 140 | :param width: size of the plot 141 | :param rotation: rotation of the x-axis 142 | :param title: Title of the box plot 143 | :return: 144 | """ 145 | pd.set_option("mode.chained_assignment", None) 146 | normalized = dataset[[field, class_field]] 147 | np.seterr(divide="ignore") 148 | plt.figure(figsize=(width, 14)) 149 | if log2: 150 | normalized[field] = np.log2(normalized[field]) 151 | 152 | if violin: 153 | chart = sns.violinplot( 154 | x=class_field, 155 | y=field, 156 | data=normalized, 157 | boxprops=dict(alpha=0.3), 158 | palette="muted", 159 | ) 160 | else: 161 | chart = sns.boxplot( 162 | x=class_field, 163 | y=field, 164 | data=normalized, 165 | boxprops=dict(alpha=0.3), 166 | palette="muted", 167 | ) 168 | 169 | chart.set(title=title) 170 | chart.set_xticklabels(chart.get_xticklabels(), rotation=rotation, ha="right") 171 | pd.set_option("mode.chained_assignment", "warn") 172 | 173 | return plt.gcf() 174 | 175 | 176 | # Functions needed by Combiner 177 | def load_sdrf(sdrf_path: str) -> pd.DataFrame: 178 | """ 179 | Load SDRF TSV as a dataframe. 180 | 181 | Parameters 182 | ---------- 183 | sdrf_path : str 184 | Path to SDRF TSV. 185 | 186 | Returns 187 | ------- 188 | pd.DataFrame 189 | """ 190 | if not os.path.exists(sdrf_path): 191 | raise FileNotFoundError(f"{sdrf_path} does not exist!") 192 | sdrf_df = pd.read_csv(sdrf_path, sep="\t") 193 | sdrf_df.columns = [col.lower() for col in sdrf_df.columns] 194 | return sdrf_df 195 | 196 | 197 | def load_feature(feature_path: str) -> pd.DataFrame: 198 | """ 199 | Load feature file as a dataframe. 
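
For example (the path below is from this repository's test data; any
``.parquet`` or ``.csv`` feature table works):

df = load_feature("tests/example/feature.parquet")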
200 | 201 | Parameters 202 | ---------- 203 | feature_path : str 204 | Path to feature file. 205 | 206 | Returns 207 | ------- 208 | pd.DataFrame 209 | 210 | Raises 211 | ------ 212 | ValueError 213 | If the provided file's suffix is not supported, either "parquet" or "csv 214 | 215 | """ 216 | suffix = os.path.splitext(feature_path)[1][1:] 217 | if suffix == "parquet": 218 | return pd.read_parquet(feature_path) 219 | elif suffix == "csv": 220 | return pd.read_csv(feature_path) 221 | else: 222 | raise ValueError( 223 | f"{suffix} is not allowed as input, please provide msstats_in or feature parquet." 224 | ) 225 | 226 | 227 | def is_parquet(path: str) -> bool: 228 | """ 229 | Check if a file is in Parquet format. 230 | 231 | This function attempts to open the specified file and read its header 232 | to determine if it matches the Parquet file signature. 233 | 234 | Parameters 235 | ---------- 236 | path : str 237 | The file path to check. 238 | 239 | Returns 240 | ------- 241 | bool 242 | True if the file is a Parquet file, False otherwise. 243 | """ 244 | try: 245 | with open(path, "rb") as fh: 246 | header = fh.read(4) 247 | return header == b"PAR1" 248 | except IOError: 249 | return False 250 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/ibaqpy_postprocessing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Union 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.ibaqpy_commons import ( 7 | IBAQ, 8 | IBAQ_NORMALIZED, 9 | IBAQ_PPB, 10 | IBAQ_LOG, 11 | TPA, 12 | COPYNUMBER, 13 | PROTEIN_NAME, 14 | SAMPLE_ID, 15 | ) 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | 21 | 22 | def remove_samples_low_protein_number(ibaq_df: pd.DataFrame, min_protein_num: int) -> pd.DataFrame: 23 | """ 24 | Remove samples with a low number of unique proteins from the DataFrame. 25 | 26 | This function filters out samples from the given DataFrame that have fewer 27 | unique proteins than the specified minimum threshold. 28 | 29 | Parameters 30 | ---------- 31 | ibaq_df : pd.DataFrame 32 | The input DataFrame containing iBAQ data. 33 | min_protein_num : int 34 | The minimum number of unique proteins required to keep a sample. 35 | 36 | Returns 37 | ------- 38 | pd.DataFrame 39 | A filtered DataFrame containing only samples with at least the specified number of unique proteins. 40 | """ 41 | 42 | protein_num = ibaq_df.groupby(SAMPLE_ID)[PROTEIN_NAME].nunique() 43 | 44 | # Get the samples with more than min_protein_num proteins 45 | samples_to_keep = protein_num[protein_num >= min_protein_num].index 46 | samples_to_remove = protein_num[protein_num < min_protein_num].index 47 | 48 | logger.info( 49 | "The number of samples with number of proteins lower than {} is {}".format( 50 | min_protein_num, len(samples_to_remove) 51 | ) 52 | ) 53 | 54 | # Filter the samples 55 | ibaq_df = ibaq_df[ibaq_df["SampleID"].isin(samples_to_keep)] 56 | return ibaq_df 57 | 58 | 59 | def remove_missing_values( 60 | ibaq_df: pd.DataFrame, 61 | missingness_percentage: float = 30, 62 | expression_column: str = IBAQ, 63 | ) -> pd.DataFrame: 64 | """ 65 | Remove samples from the DataFrame based on missing values in the expression column. 66 | 67 | This function filters out samples from the input DataFrame where the percentage 68 | of missing values in the specified expression column exceeds the given threshold. 
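
For example, to drop samples whose iBAQ column is more than 30% missing
(illustrative call):

filtered = remove_missing_values(ibaq_df, missingness_percentage=30)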
69 | 70 | Parameters 71 | ---------- 72 | ibaq_df : pd.DataFrame 73 | The input DataFrame containing iBAQ data. 74 | missingness_percentage : float 75 | The threshold percentage of missing values allowed per sample. 76 | expression_column : str 77 | The column name in the DataFrame representing expression values. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | A DataFrame with samples filtered based on the missingness criteria. 83 | 84 | Raises 85 | ------ 86 | ValueError 87 | If the input is not a DataFrame or if the expression column is not present. 88 | """ 89 | 90 | # Ensure the input is a DataFrame 91 | if not isinstance(ibaq_df, pd.DataFrame): 92 | raise ValueError("The input ibaq_df must be a pandas DataFrame.") 93 | 94 | if expression_column not in ibaq_df.columns: 95 | raise ValueError(f"The expression column '{expression_column}' is not in the DataFrame.") 96 | 97 | # Initial number of samples 98 | initial_sample_count = ibaq_df["SampleID"].nunique() 99 | logger.info(f"Initial number of samples: {initial_sample_count}") 100 | 101 | # Create a pivot table to organize data by ProteinName and SampleID 102 | pivot_df = ibaq_df.pivot_table(index=PROTEIN_NAME, columns=SAMPLE_ID, values=expression_column) 103 | 104 | # Remove samples where all proteins have missing values 105 | non_missing_samples = pivot_df.columns[pivot_df.notna().any(axis=0)] 106 | 107 | # Compute missingness percentage per sample 108 | missingness = pivot_df[non_missing_samples].isna().sum() / len(pivot_df) * 100 109 | 110 | # Filter samples based on the missingness percentage threshold 111 | valid_samples = missingness[missingness <= missingness_percentage].index 112 | 113 | # Filter the original DataFrame for valid samples 114 | filtered_df = ibaq_df[ibaq_df[SAMPLE_ID].isin(valid_samples)] 115 | 116 | # Final number of samples 117 | final_sample_count = filtered_df[SAMPLE_ID].nunique() 118 | logger.info(f"Final number of samples: {final_sample_count}") 119 | 120 | removed_sample_count = initial_sample_count - final_sample_count 121 | logger.info(f"Number of samples removed: {removed_sample_count}") 122 | 123 | return filtered_df 124 | 125 | 126 | def describe_expression_metrics(ibaq_df: pd.DataFrame) -> pd.DataFrame: 127 | """ 128 | Generate descriptive statistics for expression metrics in an iBAQ DataFrame. 129 | 130 | This function calculates descriptive statistics for specific expression 131 | metrics within the provided iBAQ DataFrame, grouped by sample ID. 132 | 133 | Parameters 134 | ---------- 135 | ibaq_df : pd.DataFrame 136 | The DataFrame containing iBAQ expression data. 137 | 138 | Returns: 139 | pd.DataFrame: 140 | A DataFrame with descriptive statistics for the expression metrics, grouped by 141 | sample ID. 142 | """ 143 | 144 | possible_expression_values = [ 145 | IBAQ, 146 | IBAQ_NORMALIZED, 147 | IBAQ_LOG, 148 | IBAQ_PPB, 149 | TPA, 150 | COPYNUMBER, 151 | ] 152 | 153 | # Define the expression columns 154 | expression_columns = [col for col in ibaq_df.columns if col in possible_expression_values] 155 | 156 | # Get the metrics 157 | metrics = ibaq_df.groupby(SAMPLE_ID)[expression_columns].describe() 158 | return metrics 159 | 160 | 161 | def pivot_wider( 162 | df: pd.DataFrame, 163 | row_name: str, 164 | col_name: str, 165 | values: str, 166 | fillna: Union[int, float, bool] = False, 167 | ) -> pd.DataFrame: 168 | """ 169 | Create a matrix from a DataFrame given the row, column, and value columns. 
170 | 171 | Parameters 172 | ---------- 173 | df : pd.DataFrame 174 | The input DataFrame in long format. 175 | row_name : str 176 | The column name to use as row labels (e.g., sample_ids). 177 | col_name : str 178 | The column name to use as column labels (e.g., protein_names). 179 | values : str 180 | The column name to use as cell values (e.g., expression_values). 181 | fillna : Optional[Union[bool, int, float]] 182 | Value to fill NaN. If True, fill NaN with 0. If False or None, leave NaN as is. 183 | If a number is provided, use that value. 184 | 185 | Returns 186 | ------- 187 | pd.DataFrame 188 | A pivot table (matrix) with specified rows, columns, and values. 189 | 190 | Examples 191 | -------- 192 | >>> df_matrix = pivot_wider(combined_df, 193 | row_name='SampleID', 194 | col_name='ProteinName', 195 | values='Ibaq', 196 | fillna=False) 197 | """ 198 | # Check if the provided columns exist in the DataFrame 199 | missing_columns = {row_name, col_name, values} - set(df.columns) 200 | if missing_columns: 201 | raise ValueError(f"Columns {missing_columns} not found in the DataFrame.") 202 | 203 | # Check for duplicate combinations 204 | duplicates = df.groupby([row_name, col_name]).size() 205 | if (duplicates > 1).any(): 206 | raise ValueError( 207 | f"Found duplicate combinations of {row_name} and {col_name}. " 208 | "Use an aggregation function to handle duplicates." 209 | ) 210 | 211 | # Use pivot_table to create the matrix 212 | matrix = df.pivot_table(index=row_name, columns=col_name, values=values, aggfunc="first") 213 | 214 | # Simplified NaN handling 215 | if fillna is True: # Fill with 0 if True 216 | matrix = matrix.fillna(0) 217 | elif fillna not in [None, False]: # Fill if a specific value is provided 218 | matrix = matrix.fillna(fillna) 219 | 220 | return matrix 221 | 222 | 223 | def pivot_longer(df: pd.DataFrame, row_name: str, col_name: str, values: str) -> pd.DataFrame: 224 | """ 225 | Transforms a wide-format DataFrame into a long-format DataFrame. 226 | 227 | This function takes a DataFrame and pivots it from a wide format to a long format 228 | using the specified row name, column name, and values. It validates the input 229 | DataFrame and checks for the existence of the specified row name. The function 230 | also logs a warning if any missing values are found in the resulting DataFrame. 231 | 232 | Parameters 233 | ---------- 234 | df : pd.DataFrame 235 | The input DataFrame to be transformed. 236 | row_name : str 237 | The name of the column to use as the identifier variable. 238 | col_name : str 239 | The name for the new column that will contain the former column names. 240 | values : str 241 | The name for the new column that will contain the values. 242 | 243 | Returns 244 | ------- 245 | pd.DataFrame 246 | A long-format DataFrame with specified column names. 
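
Examples
--------
Round-trip of a matrix produced by ``pivot_wider`` (names are illustrative):

long_df = pivot_longer(df_matrix, row_name='SampleID', col_name='ProteinName', values='Ibaq')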
247 | """ 248 | # Validate input DataFrame 249 | if not isinstance(df, pd.DataFrame): 250 | raise ValueError("Input must be a pandas DataFrame") 251 | 252 | # Validate row_name exists in DataFrame 253 | if row_name not in df.columns: 254 | raise ValueError(f"Row name '{row_name}' not found in DataFrame") 255 | 256 | # Reset the index to convert the row labels to a column 257 | matrix_reset = df.reset_index() 258 | 259 | # Use pd.melt to convert the wide-format DataFrame to long-format 260 | long_df = pd.melt(matrix_reset, id_vars=[row_name], var_name=col_name, value_name=values) 261 | 262 | # Remove rows with missing values if any 263 | if long_df[values].isna().any(): 264 | logging.warning(f"Found {long_df[values].isna().sum()} missing values in the result") 265 | 266 | return long_df 267 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/imputation_methods.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union, List 2 | 3 | import pandas as pd 4 | from sklearn.impute import KNNImputer, SimpleImputer 5 | 6 | 7 | def impute_missing_values( 8 | data: Optional[Union[pd.DataFrame, List[pd.DataFrame], None]], 9 | method: str = "knn", 10 | n_neighbors: int = 5, 11 | weights: str = "uniform", 12 | metric: str = "nan_euclidean", 13 | keep_empty_features: bool = True, 14 | fill_value: float = 0.0, 15 | ) -> Union[pd.DataFrame, List[pd.DataFrame], None]: 16 | """ 17 | Impute missing values in a DataFrame or a list of DataFrames using KNN, mean, median, most frequent, or a specific value. 18 | 19 | Parameters: 20 | data : Optional[Union[pd.DataFrame, List[pd.DataFrame]]] 21 | A pandas DataFrame or a list of pandas DataFrames containing missing values to be imputed. 22 | The DataFrame(s) must adhere to the following format: 23 | - Rows represent samples (observations). 24 | - Columns represent features (variables). 25 | - Contain only numerical columns (e.g., float or int). 26 | - Missing values must be explicitly represented as `np.nan` or `pd.NA`. 27 | - Columns with non-numerical data (e.g., categorical or text) should be preprocessed 28 | (e.g., encoded into numerical values) before using this function. 29 | - Features (columns) with entirely missing values are handled based on the 30 | `keep_empty_features` parameter. 31 | method : str, optional 32 | The imputation method to use. Options are: 33 | - "knn" (default): Use K-Nearest Neighbors imputation. 34 | - "mean": Impute using the mean of each column. 35 | - "median": Impute using the median of each column. 36 | - "most_frequent": Impute using the most frequent value of each column. 37 | - "constant": Impute using a specific value provided via `fill_value`. 38 | n_neighbors : int, optional 39 | The number of neighboring samples to use for KNN imputation. Default is 5. 40 | weights : str, optional 41 | The weight function used in KNN prediction. Can be 'uniform' or 'distance'. Default is 'uniform'. 42 | metric : str, optional 43 | The distance metric used for finding neighbors in KNN. Default is 'nan_euclidean'. 44 | fill_value : float, optional 45 | The constant value to use for imputation when `method` is "constant". Default is 0.0. 46 | keep_empty_features : bool, optional 47 | Whether to keep features that are entirely empty (i.e., all values are NaN). Default is True. 48 | 49 | Returns: 50 | Union[pd.DataFrame, List[pd.DataFrame]] 51 | A pandas DataFrame or a list of pandas DataFrames with imputed missing values. 
52 | If the input is None, the function will return None. 53 | 54 | Notes 55 | ----- 56 | - This function uses sklearn's KNNImputer and SimpleImputer for imputing missing values. 57 | - The `nan_euclidean` metric is specifically designed to handle NaN values during distance computation. 58 | - Column names and indices are preserved in the output. 59 | - Ensure the input data is numerical and properly formatted for the imputer. 60 | """ 61 | if data is None: 62 | # placeholder for further implementation 63 | return None 64 | 65 | if method not in {"knn", "mean", "median", "constant", "most_frequent"}: 66 | raise ValueError( 67 | "Invalid method. Choose from 'knn', 'mean', 'median', 'most_frequent', or 'constant'." 68 | ) 69 | 70 | if method == "knn": 71 | imputer = KNNImputer( 72 | n_neighbors=n_neighbors, 73 | weights=weights, 74 | metric=metric, 75 | keep_empty_features=keep_empty_features, 76 | ) 77 | else: 78 | strategy = method 79 | imputer = SimpleImputer(strategy=strategy, fill_value=fill_value) 80 | 81 | def impute(df: pd.DataFrame) -> pd.DataFrame: 82 | imputed_data = imputer.fit_transform(df) 83 | return pd.DataFrame(imputed_data, columns=df.columns, index=df.index) 84 | 85 | if isinstance(data, pd.DataFrame): 86 | # Impute missing values for a single DataFrame 87 | return impute(data) 88 | elif isinstance(data, list) and all(isinstance(df, pd.DataFrame) for df in data): 89 | # Impute missing values for a list of DataFrames 90 | return [impute(df) for df in data] 91 | else: 92 | raise ValueError( 93 | "The input data must be a pandas DataFrame, a list of DataFrames, or None." 94 | ) 95 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from typing import Optional, Dict, Any, Union 5 | from datetime import datetime 6 | 7 | # Default log format 8 | DEFAULT_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" 9 | DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S" 10 | 11 | # Log levels dictionary for easy conversion from string 12 | LOG_LEVELS = { 13 | "debug": logging.DEBUG, 14 | "info": logging.INFO, 15 | "warning": logging.WARNING, 16 | "error": logging.ERROR, 17 | "critical": logging.CRITICAL, 18 | } 19 | 20 | 21 | class ContextAdapter(logging.LoggerAdapter): 22 | """ 23 | A logger adapter that adds context information to log messages. 24 | This makes logs more useful for debugging by providing additional context. 25 | """ 26 | 27 | def process(self, msg, kwargs): 28 | if self.extra: 29 | context_str = " ".join(f"{k}={v}" for k, v in self.extra.items()) 30 | return f"{msg} [{context_str}]", kwargs 31 | return msg, kwargs 32 | 33 | 34 | def get_logger( 35 | name: str, context: Optional[Dict[str, Any]] = None 36 | ) -> Union[logging.Logger, logging.LoggerAdapter]: 37 | """ 38 | Get a logger with the specified name and optional context. 
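
For example:

>>> log = get_logger("ibaqpy.example", context={"sample": "S-1"})
>>> log.info("processing")  # emitted as: "processing [sample=S-1]"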
39 | 40 | Args: 41 | name: The name of the logger 42 | context: Optional dictionary of context values to include in log messages 43 | 44 | Returns: 45 | A logger or logger adapter with the specified name and context 46 | """ 47 | logger = logging.getLogger(name) 48 | 49 | # If context is provided, return a ContextAdapter 50 | if context: 51 | return ContextAdapter(logger, context) 52 | 53 | return logger 54 | 55 | 56 | def configure_logging( 57 | level: str = "info", 58 | log_file: Optional[str] = None, 59 | log_format: str = DEFAULT_LOG_FORMAT, 60 | date_format: str = DEFAULT_DATE_FORMAT, 61 | propagate: bool = True, 62 | ) -> None: 63 | """ 64 | Configure the logging system for the application. 65 | 66 | Args: 67 | level: The log level (debug, info, warning, error, critical) 68 | log_file: Optional path to a log file 69 | log_format: The format string for log messages 70 | date_format: The format string for timestamps 71 | propagate: Whether to propagate logs to parent loggers 72 | """ 73 | # Convert level string to logging level 74 | log_level = LOG_LEVELS.get(level.lower(), logging.INFO) 75 | 76 | # Configure root logger 77 | root_logger = logging.getLogger() 78 | root_logger.setLevel(log_level) 79 | 80 | # Remove existing handlers to avoid duplicate logs 81 | for handler in root_logger.handlers[:]: 82 | root_logger.removeHandler(handler) 83 | 84 | # Create formatter 85 | formatter = logging.Formatter(log_format, date_format) 86 | 87 | # Configure console handler 88 | console_handler = logging.StreamHandler(sys.stdout) 89 | console_handler.setFormatter(formatter) 90 | root_logger.addHandler(console_handler) 91 | 92 | # Configure file handler if log_file is specified 93 | if log_file: 94 | os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) 95 | file_handler = logging.FileHandler(log_file) 96 | file_handler.setFormatter(formatter) 97 | root_logger.addHandler(file_handler) 98 | 99 | # Configure ibaqpy loggers 100 | ibaqpy_logger = logging.getLogger("ibaqpy") 101 | ibaqpy_logger.setLevel(log_level) 102 | ibaqpy_logger.propagate = propagate 103 | 104 | 105 | def log_execution_time( 106 | logger: Union[logging.Logger, logging.LoggerAdapter], level: int = logging.INFO 107 | ): 108 | """ 109 | Decorator to log the execution time of a function. 110 | 111 | Args: 112 | logger: The logger to use 113 | level: The log level to use 114 | 115 | Returns: 116 | A decorator that logs the execution time of the decorated function 117 | """ 118 | 119 | def decorator(func): 120 | def wrapper(*args, **kwargs): 121 | start_time = datetime.now() 122 | logger.log(level, f"Starting {func.__name__}") 123 | 124 | try: 125 | result = func(*args, **kwargs) 126 | end_time = datetime.now() 127 | execution_time = end_time - start_time 128 | logger.log(level, f"Completed {func.__name__} in {execution_time}") 129 | return result 130 | except Exception as e: 131 | end_time = datetime.now() 132 | execution_time = end_time - start_time 133 | logger.exception(f"Error in {func.__name__} after {execution_time}: {str(e)}") 134 | raise 135 | 136 | return wrapper 137 | 138 | return decorator 139 | 140 | 141 | def log_function_call( 142 | logger: Union[logging.Logger, logging.LoggerAdapter], level: int = logging.DEBUG 143 | ): 144 | """ 145 | Decorator to log function calls with arguments. 
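
For example:

>>> import logging
>>> @log_function_call(logging.getLogger("ibaqpy"), level=logging.INFO)
... def add(a, b):
...     return a + b
>>> add(1, b=2)
3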
146 | 147 | Args: 148 | logger: The logger to use 149 | level: The log level to use 150 | 151 | Returns: 152 | A decorator that logs function calls with arguments 153 | """ 154 | 155 | def decorator(func): 156 | def wrapper(*args, **kwargs): 157 | args_str = ", ".join([str(arg) for arg in args]) 158 | kwargs_str = ", ".join([f"{k}={v}" for k, v in kwargs.items()]) 159 | all_args = ", ".join(filter(None, [args_str, kwargs_str])) 160 | 161 | logger.log(level, f"Calling {func.__name__}({all_args})") 162 | return func(*args, **kwargs) 163 | 164 | return wrapper 165 | 166 | return decorator -------------------------------------------------------------------------------- /ibaqpy/ibaq/logging_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for the ibaqpy package. 3 | 4 | This module provides functions to configure the logging system for the ibaqpy package. 5 | It should be imported and initialized at the start of the application. 6 | """ 7 | 8 | import os 9 | import logging 10 | from typing import Optional 11 | 12 | from ibaqpy.ibaq.logger import configure_logging 13 | 14 | 15 | def initialize_logging( 16 | level: str = "info", 17 | log_file: Optional[str] = None, 18 | log_format: Optional[str] = None, 19 | date_format: Optional[str] = None, 20 | ) -> None: 21 | """ 22 | Initialize the logging system for the ibaqpy package. 23 | 24 | This function should be called at the start of the application to configure 25 | the logging system. It sets up console and file logging with appropriate 26 | formatting. 27 | 28 | Args: 29 | level: The log level (debug, info, warning, error, critical) 30 | log_file: Optional path to a log file 31 | log_format: Optional format string for log messages 32 | date_format: Optional format string for timestamps 33 | """ 34 | # Use environment variables if available 35 | env_level = os.environ.get("IBAQPY_LOG_LEVEL", level) 36 | env_log_file = os.environ.get("IBAQPY_LOG_FILE", log_file) 37 | 38 | # Configure logging 39 | configure_logging( 40 | level=env_level, 41 | log_file=env_log_file, 42 | log_format=log_format, 43 | date_format=date_format, 44 | ) 45 | 46 | # Log initialization 47 | logger = logging.getLogger("ibaqpy") 48 | logger.info("Logging initialized at level %s", env_level.upper()) 49 | if env_log_file: 50 | logger.info("Log file: %s", env_log_file) 51 | 52 | 53 | def get_log_file_path(base_dir: Optional[str] = None) -> str: 54 | """ 55 | Get a default log file path based on the current date. 
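
For example (the base directory is illustrative and is created if missing):

path = get_log_file_path("/tmp/ibaqpy-logs")  # e.g. '/tmp/ibaqpy-logs/ibaqpy_2025-01-31.log'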
56 | 57 | Args: 58 | base_dir: Optional base directory for log files 59 | 60 | Returns: 61 | A path to a log file 62 | """ 63 | import datetime 64 | 65 | # Default to logs directory in current working directory 66 | if base_dir is None: 67 | base_dir = os.path.join(os.getcwd(), "logs") 68 | 69 | # Create logs directory if it doesn't exist 70 | os.makedirs(base_dir, exist_ok=True) 71 | 72 | # Create log file name based on current date 73 | date_str = datetime.datetime.now().strftime("%Y-%m-%d") 74 | log_file = os.path.join(base_dir, f"ibaqpy_{date_str}.log") 75 | 76 | return log_file -------------------------------------------------------------------------------- /ibaqpy/ibaq/write_queue.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from threading import Thread 4 | from queue import Queue, Empty 5 | from typing import Any 6 | 7 | import pandas as pd 8 | import pyarrow as pa 9 | from pyarrow import parquet as pq 10 | 11 | from .logger import get_logger 12 | 13 | # Get a logger for this module 14 | logger = get_logger("ibaqpy.write_queue") 15 | 16 | 17 | class WriteCSVTask(Thread): 18 | """ 19 | A thread-based task for writing pandas DataFrames to a CSV file. 20 | 21 | This class extends the Thread class to asynchronously write DataFrames 22 | to a specified CSV file path. It manages a queue to handle incoming 23 | DataFrames and writes them to the file in the order they are received. 24 | The CSV file is created with an optional header, and additional write 25 | options can be specified. 26 | 27 | Attributes: 28 | path (str): The file path where the CSV will be written. 29 | write_options (dict[str, Any]): Options for writing the CSV file. 30 | _queue (Queue): A queue to manage DataFrames to be written. 31 | _wrote_header (bool): Indicates if the CSV header has been written. 32 | 33 | Methods: 34 | write(table: pd.DataFrame): Adds a DataFrame to the queue for writing. 35 | close(): Signals the thread to finish processing and close. 36 | run(): Continuously processes the queue to write DataFrames to the CSV. 37 | """ 38 | 39 | path: str 40 | 41 | write_options: dict[str, Any] 42 | 43 | _queue: Queue 44 | _wrote_header: bool 45 | 46 | def __init__(self, path: str, daemon: bool = True, write_options: dict = None, **kwargs): 47 | """ 48 | Initializes a WriteCSVTask instance. 49 | 50 | Parameters: 51 | path (str): The file path where the CSV will be written. The extension 52 | will be automatically set to '.csv'. 53 | daemon (bool, optional): Whether the thread should be a daemon thread. 54 | Defaults to True. 55 | write_options (dict, optional): Additional options for writing the CSV 56 | file. Defaults to None. 57 | **kwargs: Additional keyword arguments to be merged with write_options. 58 | 59 | Attributes: 60 | path (str): The file path for the CSV file. 61 | write_options (dict): Options for writing the CSV file. 62 | _wrote_header (bool): Indicates if the CSV header has been written. 63 | _queue (Queue): A queue to manage DataFrames to be written. 64 | """ 65 | super().__init__(daemon=daemon) 66 | if write_options is None: 67 | write_options = {} 68 | 69 | path, _ext = os.path.splitext(path) 70 | path += ".csv" 71 | 72 | self.path = path 73 | self.write_options = write_options | kwargs 74 | self._wrote_header = False 75 | self._queue = Queue() 76 | 77 | def write(self, table: pd.DataFrame): 78 | """ 79 | Adds a DataFrame to the queue for writing to the CSV file. 
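
A minimal usage sketch for the whole task (path and data are illustrative;
the thread must be started before queuing):

task = WriteCSVTask("out/peptides")  # writes to 'out/peptides.csv'
task.start()
task.write(pd.DataFrame({"ProteinName": ["P12345"], "Ibaq": [1.0]}))
task.close()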
80 | 81 | Parameters: 82 | table (pd.DataFrame): The DataFrame to be added to the queue. 83 | """ 84 | logger.debug("Queuing %d rows for CSV writing to %s", len(table), self.path) 85 | self._queue.put(table) 86 | 87 | def close(self): 88 | """ 89 | Signals the thread to finish processing and close the file. 90 | """ 91 | logger.debug("Closing CSV writer queue for %s", self.path) 92 | self._queue.put(None) 93 | self.join() 94 | 95 | def _write(self, table: pd.DataFrame): 96 | """ 97 | Writes a DataFrame to the CSV file specified by the path attribute. 98 | 99 | This method appends the DataFrame to the CSV file if the header has 100 | already been written; otherwise, it writes the DataFrame with the header. 101 | The writing options are specified by the write_options attribute. 102 | 103 | Parameters: 104 | table (pd.DataFrame): The DataFrame to be written to the CSV file. 105 | """ 106 | start_time = time.time() 107 | rows = len(table) 108 | 109 | try: 110 | table.to_csv( 111 | self.path, 112 | header=not self._wrote_header, 113 | mode="a+" if self._wrote_header else "w", 114 | index=False, 115 | **self.write_options, 116 | ) 117 | self._wrote_header = True 118 | 119 | elapsed = time.time() - start_time 120 | logger.debug("Wrote %d rows to CSV file %s in %.2f seconds", rows, self.path, elapsed) 121 | except Exception as e: 122 | logger.error("Error writing to CSV file %s: %s", self.path, str(e)) 123 | raise 124 | 125 | def _close(self): 126 | logger.debug("Closing CSV writer for %s", self.path) 127 | 128 | def run(self): 129 | """ 130 | Continuously processes the queue to write DataFrames to the CSV file. 131 | 132 | This method runs in a loop, retrieving DataFrames from the queue and 133 | writing them to the CSV file using the _write method. The loop exits 134 | when a None value is encountered in the queue, signaling the end of 135 | the writing process. 136 | """ 137 | while True: 138 | try: 139 | table: pd.DataFrame = self._queue.get(True) 140 | except Empty: 141 | continue 142 | 143 | if table is None: 144 | break 145 | 146 | self._write(table) 147 | 148 | 149 | class WriteParquetTask(Thread): 150 | """ 151 | A thread-based task for writing pandas DataFrames to a Parquet file. 152 | 153 | This class extends the Thread class to asynchronously write DataFrames 154 | to a Parquet file using a queue. It manages the ParquetWriter and schema 155 | internally, ensuring that data is written efficiently and safely. 156 | 157 | Attributes: 158 | path (str): The file path where the Parquet file will be written. 159 | metadata (dict[str, Any]): Metadata to be added to the Parquet file. 160 | _queue (Queue): A queue to hold DataFrames to be written. 161 | _schema (pa.Schema): The schema of the Parquet file. 162 | _writer (pq.ParquetWriter): The writer object for the Parquet file. 163 | 164 | Methods: 165 | write(table: pd.DataFrame): Adds a DataFrame to the queue for writing. 166 | close(): Signals the thread to finish writing and close the file. 167 | run(): The main loop of the thread, processing the queue. 
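
Examples
--------
A minimal sketch (path, metadata, and ``df_chunk`` are illustrative; every
queued chunk must share the schema of the first one, since that schema
initializes the writer):

task = WriteParquetTask("out/features", metadata={"tool": "ibaqpy"})
task.start()
task.write(df_chunk)
task.close()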
168 | """ 169 | 170 | path: str 171 | metadata: dict[str, Any] 172 | 173 | _queue: Queue 174 | 175 | _schema: pa.Schema 176 | _writer: pq.ParquetWriter 177 | 178 | def __init__(self, path: str, daemon: bool = True, metadata: dict = None, **kwargs): 179 | super().__init__(daemon=daemon) 180 | 181 | if metadata is None: 182 | metadata = {} 183 | path, _ext = os.path.splitext(path) 184 | path += ".parquet" 185 | 186 | self.path = path 187 | self.metadata = metadata | kwargs 188 | self._queue = Queue() 189 | self._writer = None 190 | self._schema = None 191 | 192 | def write(self, table: pd.DataFrame): 193 | """ 194 | Adds a DataFrame to the queue for writing to the Parquet file. 195 | 196 | Parameters: 197 | table (pd.DataFrame): The DataFrame to be added to the queue. 198 | """ 199 | logger.debug("Queuing %d rows for Parquet writing to %s", len(table), self.path) 200 | self._queue.put(table) 201 | 202 | def close(self): 203 | """ 204 | Signals the thread to finish processing and close the file. 205 | """ 206 | logger.debug("Closing Parquet writer queue for %s", self.path) 207 | self._queue.put(None) 208 | self.join() 209 | 210 | def _close(self): 211 | logger.debug("Closing Parquet writer for %s", self.path) 212 | self._writer.add_key_value_metadata(self.metadata) 213 | self._writer.close() 214 | 215 | def _write(self, table: pd.DataFrame): 216 | start_time = time.time() 217 | rows = len(table) 218 | 219 | try: 220 | if self._schema is None: 221 | self._schema = pa.Schema.from_pandas(table, preserve_index=False) 222 | self._writer = pq.ParquetWriter(self.path, schema=self._schema) 223 | logger.debug("Initialized Parquet writer for %s", self.path) 224 | 225 | arrow_table = pa.Table.from_pandas(table, preserve_index=False) 226 | self._writer.write_table(arrow_table) 227 | 228 | elapsed = time.time() - start_time 229 | logger.debug( 230 | "Wrote %d rows to Parquet file %s in %.2f seconds", rows, self.path, elapsed 231 | ) 232 | except Exception as e: 233 | logger.error("Error writing to Parquet file %s: %s", self.path, str(e)) 234 | raise 235 | 236 | def run(self): 237 | while True: 238 | try: 239 | table: pd.DataFrame = self._queue.get(True) 240 | except Empty: 241 | continue 242 | 243 | if table is None: 244 | break 245 | 246 | self._write(table) 247 | 248 | self._close() -------------------------------------------------------------------------------- /ibaqpy/ibaqpyc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import click 4 | from ibaqpy.commands.features2peptides import features2parquet 5 | from ibaqpy.commands.peptides2protein import peptides2protein 6 | from ibaqpy.commands.tsne_visualization import tsne_visualization 7 | from ibaqpy.commands.correct_batches import correct_batches 8 | 9 | import ibaqpy.__init__ as __init__ 10 | 11 | CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) 12 | 13 | 14 | LOG_LEVELS = ["debug", "info", "warn"] 15 | LOG_LEVELS_TO_LEVELS = { 16 | "debug": logging.DEBUG, 17 | "info": logging.INFO, 18 | "warn": logging.WARN, 19 | } 20 | 21 | 22 | @click.group(context_settings=CONTEXT_SETTINGS) 23 | @click.version_option( 24 | version=__init__.__version__, 25 | package_name="ibaqpy", 26 | message="%(package)s %(version)s", 27 | ) 28 | @click.option( 29 | "-v", 30 | "--log-level", 31 | type=click.Choice(LOG_LEVELS, False), 32 | default="debug", 33 | help="Set the logging level.", 34 | ) 35 | @click.option( 36 | "--log-file", 37 | type=click.Path(writable=True, 
path_type=Path), 38 | required=False, 39 | help="Write log to this file.", 40 | ) 41 | def cli(log_level: str, log_file: Path): 42 | """ 43 | Aggregrate and normalize quantitative proteomics using iBAQ (Intensity-Based Absolute Quantification) 44 | for the quantms ecosystem. 45 | """ 46 | 47 | logging.basicConfig( 48 | format="%(asctime)s [%(funcName)s] - %(message)s", 49 | level=LOG_LEVELS_TO_LEVELS[log_level.lower()], 50 | ) 51 | logging.captureWarnings(True) 52 | 53 | if log_file: 54 | if not log_file.exists(): 55 | if not log_file.parent.exists(): 56 | log_file.parent.mkdir(parents=True, exist_ok=True) 57 | handler = logging.FileHandler(log_file) 58 | handler.setLevel(LOG_LEVELS_TO_LEVELS[log_level.lower()]) 59 | handler.setFormatter(logging.Formatter("%(asctime)s [%(funcName)s] - %(message)s")) 60 | logging.getLogger().addHandler(handler) 61 | 62 | 63 | cli.add_command(features2parquet) 64 | cli.add_command(peptides2protein) 65 | cli.add_command(tsne_visualization) 66 | cli.add_command(correct_batches) 67 | 68 | 69 | def main(): 70 | """ 71 | Main function to run the CLI 72 | """ 73 | try: 74 | cli() 75 | except SystemExit as e: 76 | if e.code != 0: 77 | raise 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /ibaqpy/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/model/__init__.py -------------------------------------------------------------------------------- /ibaqpy/model/normalization.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from enum import Enum, auto 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.ibaqpy_commons import CONDITION, NORM_INTENSITY, SAMPLE_ID, TECHREPLICATE 7 | 8 | _method_registry: dict["FeatureNormalizationMethod", Callable[[pd.Series], pd.Series]] = {} 9 | 10 | 11 | class FeatureNormalizationMethod(Enum): 12 | """ 13 | FeatureNormalizationMethod is an enumeration of various methods for normalizing 14 | replicate intensities in a DataFrame. It provides functionality to register 15 | custom normalization functions and apply them to replicate data. The class 16 | supports normalization across multiple runs and samples, adjusting replicate 17 | intensities based on a sample-level average metric. Methods include NONE, Mean, 18 | Median, Max, Global, Max_Min, and IQR. The class also includes utility methods 19 | to convert from string representations and to normalize data using registered 20 | functions. 21 | """ 22 | 23 | NONE = auto() 24 | 25 | Mean = auto() 26 | Median = auto() 27 | Max = auto() 28 | Global = auto() 29 | Max_Min = auto() 30 | IQR = auto() 31 | 32 | @classmethod 33 | def from_str(cls, name: str) -> "FeatureNormalizationMethod": 34 | """ 35 | Get the normalization method from a string. 36 | Parameters: 37 | name: str The name of the normalization method. 38 | 39 | Returns: 40 | FeatureNormalizationMethod: The normalization method. 41 | """ 42 | if name is None: 43 | return cls.NONE 44 | name_ = name.lower() 45 | for k, v in cls._member_map_.items(): 46 | if k.lower() == name_: 47 | return v 48 | raise KeyError(name) 49 | 50 | def register_replicate_fn( 51 | self, fn: Callable[[pd.Series], pd.Series] 52 | ) -> Callable[[pd.Series], pd.Series]: 53 | """ 54 | Registers a custom normalization function for replicate intensities. 
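
For example, the built-in methods below register themselves this way; a
custom function registered for the same member overrides the previous one:

>>> @FeatureNormalizationMethod.Mean.register_replicate_fn
... def mean_normalize(df, *args, **kwargs):
...     return df / df.mean()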
55 | 56 | Parameters: 57 | fn (Callable[[pd.Series], pd.Series]): A function that takes a pandas Series 58 | and returns a normalized pandas Series. 59 | 60 | Returns: 61 | Callable[[pd.Series], pd.Series]: The registered normalization function. 62 | """ 63 | _method_registry[self] = fn 64 | return fn 65 | 66 | def normalize_replicates(self, df: pd.DataFrame, *args, **kwargs): 67 | """ 68 | Normalize the replicate intensities in the given DataFrame using a registered 69 | normalization function. 70 | 71 | Parameters: 72 | df (pd.DataFrame): The DataFrame containing replicate intensity data. 73 | *args: Additional positional arguments for the normalization function. 74 | **kwargs: Additional keyword arguments for the normalization function. 75 | 76 | Returns: 77 | pd.Series: The normalized replicate intensities. 78 | """ 79 | fn = _method_registry[self] 80 | return fn(df, *args, **kwargs) 81 | 82 | def normalize_sample(self, df, runs: list[str]) -> tuple[dict[str, pd.Series], float]: 83 | """ 84 | Normalize replicate intensities for a given sample across multiple runs. 85 | 86 | Parameters: 87 | df (pd.DataFrame): The DataFrame containing replicate intensity data. 88 | runs (list[str]): A list of run identifiers for the sample. 89 | 90 | Returns: 91 | tuple[dict[str, pd.Series], float]: A dictionary mapping each run to its 92 | normalized replicate intensities and the average metric across all runs. 93 | """ 94 | map_ = {} 95 | total = 0 96 | for run in runs: 97 | run = str(run) 98 | run_m = self.normalize_replicates(df.loc[df[TECHREPLICATE] == run, NORM_INTENSITY]) 99 | map_[run] = run_m 100 | total += run_m 101 | sample_average_metric = total / len(runs) 102 | return map_, sample_average_metric 103 | 104 | def normalize_runs(self, df: pd.DataFrame, technical_replicates: int): 105 | """ 106 | Normalize the intensities of runs in the given DataFrame using a registered 107 | 108 | Parameters: 109 | df: pd.DataFrame The DataFrame containing replicate intensity data. 110 | technical_replicates: int The number of technical replicates for each sample. 111 | 112 | Returns: 113 | pd.DataFrame: The DataFrame with normalized replicate intensities. 114 | 115 | """ 116 | if technical_replicates > 1: 117 | samples = df[SAMPLE_ID].unique() 118 | for sample in samples: 119 | runs = df.loc[df[SAMPLE_ID] == sample, TECHREPLICATE].unique().tolist() 120 | if len(runs) > 1: 121 | sample_df = df.loc[df[SAMPLE_ID] == sample, :] 122 | 123 | replicate_metric_map, sample_average_metric = self.normalize_sample( 124 | sample_df, runs 125 | ) 126 | 127 | # For each replicate in each sample, normalize the per-replicate 128 | # intensity by a replicate-level statistic, relative to the sample 129 | # average over that replicate statistic. 130 | # 131 | # In effect, this scales runs down when the replicate average > sample average 132 | # and scales runs up when the replicate average < sample average. 
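# Worked example with illustrative numbers: if the per-run metrics are
# {"1": 2.0, "2": 4.0}, the sample average is 3.0; run "1" intensities are
# divided by 2.0/3.0 (scaled up) and run "2" by 4.0/3.0 (scaled down).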
133 | for run in runs: 134 | run = str(run) 135 | run_intensity = df.loc[ 136 | (df[SAMPLE_ID] == sample) & (df[TECHREPLICATE] == run), 137 | NORM_INTENSITY, 138 | ] 139 | df.loc[ 140 | (df[SAMPLE_ID] == sample) & (df[TECHREPLICATE] == run), 141 | NORM_INTENSITY, 142 | ] = run_intensity / (replicate_metric_map[run] / sample_average_metric) 143 | return df 144 | else: 145 | return df 146 | 147 | def __call__(self, df: pd.DataFrame, technical_replicates: int): 148 | return self.normalize_runs(df, technical_replicates) 149 | 150 | 151 | @FeatureNormalizationMethod.NONE.register_replicate_fn 152 | def no_normalization(df, *args, **kwargs): 153 | """ 154 | No normalization is performed on the data. 155 | Parameters: 156 | df: pd.DataFrame The DataFrame containing replicate intensity data. 157 | args: Additional positional arguments 158 | kwargs: Additional keyword arguments 159 | 160 | Returns: 161 | pd.DataFrame: The DataFrame containing the replicate intensity data. 162 | 163 | """ 164 | return df 165 | 166 | 167 | @FeatureNormalizationMethod.Mean.register_replicate_fn 168 | def mean_normalize(df, *args, **kwargs): 169 | """ 170 | Mean normalization of the data. 171 | 172 | Parameters: 173 | df: pd.DataFrame The DataFrame containing replicate intensity data. 174 | args: Additional positional arguments 175 | kwargs: Additional keyword arguments 176 | 177 | Returns: 178 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 179 | 180 | """ 181 | return df / df.mean() 182 | 183 | 184 | @FeatureNormalizationMethod.Median.register_replicate_fn 185 | def median_normalize(df, *args, **kwargs): 186 | """ 187 | Median normalization of the data. 188 | Parameters: 189 | df: pd.DataFrame The DataFrame containing replicate intensity data. 190 | args: Additional positional arguments 191 | kwargs: Additional keyword arguments 192 | 193 | Returns: 194 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 195 | 196 | """ 197 | return df / df.median() 198 | 199 | 200 | @FeatureNormalizationMethod.Max.register_replicate_fn 201 | def max_normalize(df, *args, **kwargs): 202 | """ 203 | Max normalization of the data. 204 | Parameters: 205 | df: pd.DataFrame The DataFrame containing replicate intensity data. 206 | args: Additional positional arguments 207 | kwargs: Additional keyword arguments 208 | 209 | Returns: 210 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 211 | """ 212 | return df / df.max() 213 | 214 | 215 | @FeatureNormalizationMethod.Global.register_replicate_fn 216 | def global_normalize(df, *args, **kwargs): 217 | """ 218 | Global normalization of the data. 219 | Parameters: 220 | df: pd.DataFrame The DataFrame containing replicate intensity data. 221 | args: Additional positional arguments 222 | kwargs: Additional keyword arguments 223 | 224 | Returns: 225 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 226 | """ 227 | return df / df.sum() 228 | 229 | 230 | @FeatureNormalizationMethod.Max_Min.register_replicate_fn 231 | def max_min_normalize(df, *args, **kwargs): 232 | """ 233 | Max-Min normalization of the data 234 | Parameters: 235 | df: pd.DataFrame The DataFrame containing replicate intensity data. 236 | args: Additional positional arguments 237 | kwargs: Additional keyword arguments 238 | 239 | Returns: 240 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 
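
Examples
--------
>>> import pandas as pd
>>> max_min_normalize(pd.Series([1.0, 3.0, 5.0]))
0    0.0
1    0.5
2    1.0
dtype: float64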
241 | """ 242 | min_ = df.min() 243 | return (df - min_) / (df.max() - min_) 244 | 245 | 246 | @FeatureNormalizationMethod.IQR.register_replicate_fn 247 | def iqr_normalization(df, *args, **kwargs): 248 | """ 249 | IQR normalization of the data. 250 | Parameters: 251 | df: pd.DataFrame The DataFrame containing replicate intensity data. 252 | args: Additional positional arguments 253 | kwargs: Additional keyword arguments 254 | 255 | Returns: 256 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 257 | """ 258 | return df.quantile([0.75, 0.25], interpolation="linear").mean() 259 | 260 | 261 | _peptide_method_registry = {} 262 | 263 | 264 | class PeptideNormalizationMethod(Enum): 265 | """ 266 | Enum class for peptide normalization methods, providing functionality to register 267 | and apply normalization functions to peptide data. 268 | 269 | Attributes: 270 | NONE: No normalization. 271 | GlobalMedian: Normalization using global median. 272 | ConditionMedian: Normalization using condition-specific median. 273 | 274 | Methods: 275 | from_str(name): Converts a string to a PeptideNormalizationMethod. 276 | register_replicate_fn(fn): Registers a function for a specific normalization method. 277 | normalize_sample(dataset_df, sample, med_map): Applies the registered normalization 278 | function to a sample. 279 | __call__(dataset_df, sample, med_map): Invokes normalize_sample method. 280 | """ 281 | 282 | NONE = auto() 283 | 284 | GlobalMedian = auto() 285 | ConditionMedian = auto() 286 | 287 | @classmethod 288 | def from_str(cls, name: str) -> "PeptideNormalizationMethod": 289 | """ 290 | Converts a string to a PeptideNormalizationMethod. 291 | Parameters: 292 | name: str The name of the normalization method. 293 | 294 | Returns: 295 | PeptideNormalizationMethod: The normalization method. 296 | """ 297 | name_ = name.lower() 298 | for k, v in cls._member_map_.items(): 299 | if k.lower() == name_: 300 | return v 301 | raise KeyError(name) 302 | 303 | def register_replicate_fn( 304 | self, fn: Callable[[pd.DataFrame, str, dict], pd.DataFrame] 305 | ) -> Callable[[pd.DataFrame, str, dict], pd.DataFrame]: 306 | """ 307 | Registers a function for a specific normalization method. 308 | Parameters: 309 | fn: Callable[[pd.DataFrame, str, dict], pd.DataFrame] The normalization function. 310 | 311 | Returns: 312 | Callable[[pd.DataFrame, str, dict], pd.DataFrame]: The normalization function. 313 | """ 314 | _peptide_method_registry[self] = fn 315 | return fn 316 | 317 | def normalize_sample(self, dataset_df: pd.DataFrame, sample: str, med_map: dict): 318 | """ 319 | Applies the registered normalization function to a sample. 320 | Parameters: 321 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 322 | sample: str The sample identifier. 323 | med_map: dict The median map. 324 | 325 | Returns: 326 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 327 | """ 328 | fn = _peptide_method_registry[self] 329 | return fn(dataset_df, sample, med_map) 330 | 331 | def __call__(self, dataset_df: pd.DataFrame, sample: str, med_map: dict): 332 | """ 333 | Invokes the normalize_sample method. 334 | Parameters: 335 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 336 | sample: str The sample identifier. 337 | med_map: dict The median map. 338 | 339 | Returns: 340 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 
341 | """ 342 | return self.normalize_sample(dataset_df, sample, med_map) 343 | 344 | 345 | @PeptideNormalizationMethod.GlobalMedian.register_replicate_fn 346 | def global_median(dataset_df, sample: str, med_map: dict): 347 | """ 348 | Global median normalization of the data. 349 | Parameters: 350 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 351 | sample: str The sample identifier. 352 | med_map: dict The median map. 353 | 354 | Returns: 355 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 356 | """ 357 | dataset_df.loc[:, NORM_INTENSITY] = dataset_df[NORM_INTENSITY] / med_map[sample] 358 | return dataset_df 359 | 360 | 361 | @PeptideNormalizationMethod.ConditionMedian.register_replicate_fn 362 | def condition_median(dataset_df, sample: str, med_map: dict): 363 | """ 364 | Condition median normalization of the data. 365 | Parameters: 366 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 367 | sample: str The sample identifier. 368 | med_map: dict The median map. 369 | 370 | Returns: 371 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 372 | """ 373 | con = dataset_df[CONDITION].unique()[0] 374 | dataset_df.loc[:, NORM_INTENSITY] = dataset_df[NORM_INTENSITY] / med_map[con][sample] 375 | 376 | 377 | @PeptideNormalizationMethod.NONE.register_replicate_fn 378 | def peptide_no_normalization(dataset_df, sample, med_map): 379 | """ 380 | No normalization is performed on the data. 381 | Parameters: 382 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 383 | sample: str The sample identifier. 384 | med_map: dict The median map. 385 | 386 | Returns: 387 | pd.DataFrame: The DataFrame containing the peptide intensity data. 388 | """ 389 | return dataset_df 390 | -------------------------------------------------------------------------------- /ibaqpy/model/organism_metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from dataclasses import dataclass, field 4 | from importlib.resources import open_text 5 | from typing import ClassVar, Optional 6 | 7 | 8 | @dataclass 9 | class OrganismDescription: 10 | """ 11 | Represents an organism's metadata, including its name, genome size, and histone proteins. 12 | 13 | Attributes 14 | ---------- 15 | registry (ClassVar[dict[str, OrganismDescription]]): A class-level dictionary to store 16 | registered organisms by their name in uppercase. 17 | name (str): The name of the organism. 18 | genome_size (int): The size of the organism's genome. 19 | histone_proteins (list[str]): A list of histone proteins associated with the organism. 20 | histone_entries (list[str]): A list of histone entries associated with the organism. 21 | 22 | Methods 23 | ------- 24 | get(key, default=None) -> Optional[OrganismDescription]: Retrieves an organism description 25 | from the registry using the given key, returning the default if not found. 26 | registered_organisms(): Returns a list of all registered organism names. 27 | """ 28 | 29 | registry: ClassVar[dict[str, "OrganismDescription"]] = {} 30 | 31 | name: str 32 | genome_size: int 33 | histone_proteins: list[str] = field(default_factory=list, repr=False) 34 | histone_entries: list[str] = field(default_factory=list, repr=False) 35 | 36 | @classmethod 37 | def get(cls, key, default=None) -> "Optional[OrganismDescription]": 38 | """ 39 | Retrieve an organism description from the registry using the given key. 
40 | 41 | Parameters: 42 | key (str): The name of the organism to retrieve, case-insensitive. 43 | default (Optional[OrganismDescription]): The value to return if the organism is not found. 44 | 45 | Returns: 46 | Optional[OrganismDescription]: The organism description if found, otherwise the default value. 47 | """ 48 | return cls.registry.get(key.upper(), default) 49 | 50 | @classmethod 51 | def registered_organisms(cls): 52 | """ 53 | Return a list of all registered organism names. 54 | 55 | Returns: 56 | dict_keys: A view of the registry's keys representing organism names. 57 | """ 58 | return cls.registry.keys() 59 | 60 | def __post_init__(self): 61 | """ 62 | Automatically register the organism in the class-level registry 63 | using its name in uppercase as the key. 64 | """ 65 | self.registry[self.name.upper()] = self 66 | 67 | 68 | for v in json.load(open_text("ibaqpy.data", "organisms.json")).values(): 69 | OrganismDescription(**v) 70 | -------------------------------------------------------------------------------- /ibaqpy/model/quantification_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | from collections.abc import Mapping 4 | from dataclasses import dataclass, field 5 | from typing import ClassVar, Iterator, Union, Optional 6 | 7 | 8 | class QuantificationCategory(Enum): 9 | """ 10 | An enumeration representing different quantification categories used in proteomics. 11 | 12 | Attributes: 13 | TMT: Represents Tandem Mass Tag quantification. 14 | ITRAQ: Represents Isobaric Tags for Relative and Absolute Quantitation. 15 | LFQ: Represents Label-Free Quantification. 16 | 17 | Methods: 18 | from_str(name: str) -> QuantificationCategory: 19 | Converts a string to a QuantificationCategory enum member. 20 | 21 | classify(labels: Union[list[str], set[str]]) -> tuple[Optional[QuantificationCategory], Optional[IsobaricLabel]]: 22 | Classifies a set of labels into a quantification category and determines the isobaric label scheme. 23 | """ 24 | 25 | TMT = auto() 26 | ITRAQ = auto() 27 | LFQ = auto() 28 | 29 | @classmethod 30 | def from_str(cls, name: str) -> "QuantificationCategory": 31 | """ 32 | Converts a string representation of a quantification category to its corresponding 33 | QuantificationCategory enum member. 34 | 35 | Parameters: 36 | name (str): The name of the quantification category. 37 | 38 | Returns: 39 | QuantificationCategory: The corresponding enum member. 40 | 41 | Raises: 42 | KeyError: If the provided name does not match any quantification category. 43 | """ 44 | name_ = name.lower() 45 | for k, v in cls._member_map_.items(): 46 | if k.lower() == name_: 47 | return v 48 | raise KeyError(name) 49 | 50 | @classmethod 51 | def classify( 52 | cls, labels: Union[list[str], set[str]] 53 | ) -> tuple["Optional[QuantificationCategory]", "Optional[IsobaricLabel]"]: 54 | """ 55 | Classifies a set of labels into a quantification category and determines the isobaric label scheme. 56 | 57 | Parameters: 58 | labels (Union[list[str], set[str]]): A collection of label strings to classify. 59 | 60 | Returns: 61 | tuple[Optional[QuantificationCategory], Optional[IsobaricLabel]]: 62 | A tuple containing the quantification category and the isobaric label scheme, if applicable. 63 | 64 | Raises: 65 | ValueError: If the labels do not correspond to a known quantification category. 
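
        Example (illustrative):
            >>> cat, scheme = QuantificationCategory.classify(
            ...     {"TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"}
            ... )
            >>> cat is QuantificationCategory.TMT and scheme is IsobaricLabel.TMT6plex
            True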
66 | """ 67 | label_scheme = None 68 | 69 | if len(labels) == 1 and any("label free" in s.lower() for s in labels): 70 | label_category = cls.LFQ 71 | 72 | elif any("tmt" in s.lower() for s in labels): 73 | label_category = cls.TMT 74 | if ( 75 | len(labels) > 11 76 | or "TMT134N" in labels 77 | or "TMT133C" in labels 78 | or "TMT133N" in labels 79 | or "TMT132C" in labels 80 | or "TMT132N" in labels 81 | ): 82 | label_scheme = IsobaricLabel.TMT16plex 83 | elif len(labels) == 11 or "TMT131C" in labels: 84 | label_scheme = IsobaricLabel.TMT11plex 85 | elif len(labels) > 6: 86 | label_scheme = IsobaricLabel.TMT10plex 87 | else: 88 | label_scheme = IsobaricLabel.TMT6plex 89 | 90 | elif any("itraq" in s.lower() for s in labels): 91 | label_category = cls.ITRAQ 92 | if len(labels) > 4: 93 | label_scheme = IsobaricLabel.ITRAQ8plex 94 | else: 95 | label_scheme = IsobaricLabel.ITRAQ4plex 96 | 97 | else: 98 | raise ValueError( 99 | f"Cannot infer labeling scheme from {labels}, only support label free, TMT and ITRAQ experiment!" 100 | ) 101 | return label_category, label_scheme 102 | 103 | 104 | class IsobaricLabel(Enum): 105 | """ 106 | An enumeration for different isobaric labeling schemes used in proteomics. 107 | 108 | Attributes: 109 | TMT6plex: Represents the TMT 6-plex labeling scheme. 110 | TMT10plex: Represents the TMT 10-plex labeling scheme. 111 | TMT11plex: Represents the TMT 11-plex labeling scheme. 112 | TMT16plex: Represents the TMT 16-plex labeling scheme. 113 | ITRAQ4plex: Represents the ITRAQ 4-plex labeling scheme. 114 | ITRAQ8plex: Represents the ITRAQ 8-plex labeling scheme. 115 | 116 | Methods: 117 | from_str(name: str) -> IsobaricLabel: 118 | Converts a string to an IsobaricLabel enum member. 119 | channels() -> IsobaricLabelSpec: 120 | Retrieves the channel specifications for the isobaric label. 121 | """ 122 | 123 | TMT6plex = auto() 124 | TMT10plex = auto() 125 | TMT11plex = auto() 126 | TMT16plex = auto() 127 | 128 | ITRAQ4plex = auto() 129 | ITRAQ8plex = auto() 130 | 131 | @classmethod 132 | def from_str(cls, name: str) -> "IsobaricLabel": 133 | """ 134 | Converts a string representation of a quantification category to its corresponding 135 | QuantificationCategory enum member. 136 | 137 | Parameters: 138 | name (str): The name of the quantification category. 139 | 140 | Returns: 141 | QuantificationCategory: The corresponding enum member. 142 | 143 | Raises: 144 | KeyError: If the provided name does not match any quantification category. 145 | """ 146 | name_ = name.lower() 147 | for k, v in cls._member_map_.items(): 148 | if k.lower() == name_: 149 | return v 150 | raise KeyError(name) 151 | 152 | def channels(self) -> "IsobaricLabelSpec": 153 | """ 154 | Retrieves the channel specifications associated with the isobaric label. 155 | 156 | Returns: 157 | IsobaricLabelSpec: The channel specifications for the current isobaric label. 158 | """ 159 | return IsobaricLabelSpec.registry[self.name] 160 | 161 | 162 | @dataclass 163 | class IsobaricLabelSpec(Mapping[str, int]): 164 | """ 165 | A data class representing the specifications of isobaric labels, including their 166 | name and channel mappings. This class supports dictionary-like access to channel 167 | information and maintains a registry of all instances. 168 | 169 | Attributes: 170 | registry (ClassVar[dict[str, IsobaricLabelSpec]]): A class-level registry of all 171 | isobaric label specifications. 172 | name (str): The name of the isobaric label. 
173 |         channels (dict[str, int]): A mapping of channel names to their respective indices.
174 | 
175 |     Methods:
176 |         __post_init__(): Registers the instance in the class-level registry.
177 |         id: Retrieves the corresponding IsobaricLabel enum member for the label name.
178 |         __getitem__(key: str) -> int: Returns the index of the specified channel.
179 |         __iter__() -> Iterator[str]: Iterates over the channel names.
180 |         __len__() -> int: Returns the number of channels.
181 |         __contains__(key) -> bool: Checks if a channel name exists in the channels.
182 |     """
183 | 
184 |     registry: ClassVar[dict[str, "IsobaricLabelSpec"]] = {}
185 | 
186 |     name: str
187 |     channels: dict[str, int] = field(default_factory=dict)
188 | 
189 |     def __post_init__(self):
190 |         self.registry[self.name] = self
191 | 
192 |     @property
193 |     def id(self):
194 |         try:
195 |             return IsobaricLabel[self.name]
196 |         except KeyError:
197 |             return None
198 | 
199 |     def __getitem__(self, key: str) -> int:
200 |         return self.channels[key]
201 | 
202 |     def __iter__(self) -> Iterator[str]:
203 |         yield from self.channels
204 | 
205 |     def __len__(self) -> int:
206 |         return len(self.channels)
207 | 
208 |     def __contains__(self, key) -> bool:
209 |         return key in self.channels
210 | 
211 | 
212 | TMT16plex = IsobaricLabelSpec(
213 |     "TMT16plex",
214 |     {
215 |         "TMT126": 1,
216 |         "TMT127N": 2,
217 |         "TMT127C": 3,
218 |         "TMT128N": 4,
219 |         "TMT128C": 5,
220 |         "TMT129N": 6,
221 |         "TMT129C": 7,
222 |         "TMT130N": 8,
223 |         "TMT130C": 9,
224 |         "TMT131N": 10,
225 |         "TMT131C": 11,
226 |         "TMT132N": 12,
227 |         "TMT132C": 13,
228 |         "TMT133N": 14,
229 |         "TMT133C": 15,
230 |         "TMT134N": 16,
231 |     },
232 | )
233 | 
234 | TMT11plex = IsobaricLabelSpec(
235 |     "TMT11plex",
236 |     {
237 |         "TMT126": 1,
238 |         "TMT127N": 2,
239 |         "TMT127C": 3,
240 |         "TMT128N": 4,
241 |         "TMT128C": 5,
242 |         "TMT129N": 6,
243 |         "TMT129C": 7,
244 |         "TMT130N": 8,
245 |         "TMT130C": 9,
246 |         "TMT131N": 10,
247 |         "TMT131C": 11,
248 |     },
249 | )
250 | 
251 | TMT10plex = IsobaricLabelSpec(
252 |     "TMT10plex",
253 |     {
254 |         "TMT126": 1,
255 |         "TMT127N": 2,
256 |         "TMT127C": 3,
257 |         "TMT128N": 4,
258 |         "TMT128C": 5,
259 |         "TMT129N": 6,
260 |         "TMT129C": 7,
261 |         "TMT130N": 8,
262 |         "TMT130C": 9,
263 |         "TMT131": 10,
264 |     },
265 | )
266 | 
267 | TMT6plex = IsobaricLabelSpec(
268 |     "TMT6plex",
269 |     {
270 |         "TMT126": 1,
271 |         "TMT127": 2,
272 |         "TMT128": 3,
273 |         "TMT129": 4,
274 |         "TMT130": 5,
275 |         "TMT131": 6,
276 |     },
277 | )
278 | 
279 | ITRAQ4plex = IsobaricLabelSpec(
280 |     "ITRAQ4plex", {"ITRAQ114": 1, "ITRAQ115": 2, "ITRAQ116": 3, "ITRAQ117": 4}
281 | )
282 | 
283 | ITRAQ8plex = IsobaricLabelSpec(
284 |     "ITRAQ8plex",
285 |     {
286 |         "ITRAQ113": 1,
287 |         "ITRAQ114": 2,
288 |         "ITRAQ115": 3,
289 |         "ITRAQ116": 4,
290 |         "ITRAQ117": 5,
291 |         "ITRAQ118": 6,
292 |         "ITRAQ119": 7,
293 |         "ITRAQ121": 8,
294 |     },
295 | )
296 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "ibaqpy"
3 | description = "Python package to compute intensity-based absolute expression values"
4 | readme = "README.md"
5 | license = "MIT"
6 | version = "0.0.5"
7 | authors = [
8 |     "Yasset Perez-Riverol ",
9 |     "Dai Chengxin ",
10 |     "Julianus Pfeuffer ",
11 |     "Joshua Klein ",
12 |     "Enrique Audain ",
13 |     "Ping Zheng "
14 | ]
15 | keywords = [
16 |     "quantms",
17 |     "proteomics",
18 |     "mass-spectrometry",
19 |     "data-analysis",
20 |     "big data"
21 | ]
22 | classifiers = [
23 |     "Intended 
Audience :: Science/Research", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python :: 3 :: Only", 27 | "Topic :: Scientific/Engineering :: Bio-Informatics", 28 | "Development Status :: 5 - Production/Stable" 29 | ] 30 | packages = [ 31 | { include = "ibaqpy" } 32 | ] 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.9" 36 | scikit-learn = "*" 37 | pyopenms = "*" 38 | numpy = "<2.1.0" 39 | click = "*" 40 | pandas = "*" 41 | matplotlib = "*" 42 | pyarrow = ">=16.1.0" 43 | duckdb = ">=0.10.1" 44 | qnorm = "*" 45 | scipy = ">=1.10" 46 | seaborn = ">=0.13.2" 47 | typing_extensions = ">=4.6.3" 48 | inmoose = "*" 49 | 50 | [tool.poetry.urls] 51 | GitHub = "https://github.com/bigbio/ibaqpy/" 52 | PyPi = "https://pypi.org/project/ibaqpy/" 53 | Quantms = "https://quantms.org" 54 | LICENSE = "https://github.com/bigbio/ibaqpy/blob/main/LICENSE" 55 | 56 | [tool.poetry.scripts] 57 | ibaqpyc = "ibaqpy.ibaqpyc:main" 58 | 59 | [tool.isort] 60 | profile = "black" 61 | 62 | [tool.black] 63 | line-length = 99 64 | target-version = ["py39"] 65 | 66 | [build-system] 67 | requires = ["poetry-core>=1.2.0"] 68 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /qodana.yaml: -------------------------------------------------------------------------------- 1 | version: "1.0" 2 | linter: jetbrains/qodana-jvm:2024.2 3 | profile: 4 | name: qodana.recommended 5 | include: 6 | - name: CheckDependencyLicenses 7 | -------------------------------------------------------------------------------- /recipe/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.11 3 | -------------------------------------------------------------------------------- /recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | # recipe/meta.yaml 2 | package: 3 | name: ibaqpy 4 | version: "0.0.5" 5 | 6 | source: 7 | path: ../ 8 | 9 | build: 10 | entry_points: 11 | - ibaqpyc=ibaqpy.ibaqpyc:main 12 | run_exports: 13 | - {{ pin_subpackage('ibaqpy', max_pin="x.x") }} 14 | script: "{{ PYTHON }} -m pip install . 
--no-deps --no-build-isolation --no-cache-dir -vvv"
15 |   number: 0
16 |   noarch: python
17 | 
18 | requirements:
19 |   host:
20 |     - python
21 |     - pip
22 |     - poetry-core >=1.2.0
23 |   run:
24 |     - python>=3.9
25 |     - scikit-learn
26 |     - pyopenms
27 |     - numpy<2.1.0
28 |     - click
29 |     - pandas
30 |     - matplotlib
31 |     - pyarrow>=16.1.0
32 |     - duckdb>=0.10.1
33 |     - qnorm
34 |     - scipy>=1.10
35 |     - seaborn>=0.13.2
36 |     - typing_extensions>=4.6.3
37 |     - inmoose
38 | test:
39 |   imports:
40 |     - ibaqpy
41 |   commands:
42 |     - ibaqpyc --help
43 | 
44 | about:
45 |   home: https://www.github.com/bigbio/ibaqpy
46 |   summary: Python package to compute intensity-based absolute expression values
47 |   license: MIT
48 |   license_file: LICENSE
49 |   dev_url: https://www.github.com/bigbio/ibaqpy
50 | 
51 | extra:
52 |   recipe-maintainers:
53 |     - ypriverol
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn~=1.6.1
2 | pyopenms
3 | numpy<2.1.0
4 | click~=8.1.3
5 | pandas~=2.0.1
6 | matplotlib~=3.7.1
7 | pyarrow>=16.1.0
8 | duckdb>=0.10.1
9 | qnorm
10 | scipy>=1.10
11 | seaborn>=0.13.2
12 | typing_extensions>=4.6.3
13 | inmoose
14 | pytest~=8.3.4
15 | anndata~=0.10.9
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/tests/__init__.py
--------------------------------------------------------------------------------
/tests/example/feature.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/tests/example/feature.parquet
--------------------------------------------------------------------------------
/tests/example/out/.gitignore:
--------------------------------------------------------------------------------
1 | PXD*
2 | *.pdf
--------------------------------------------------------------------------------
/tests/test_batch_correction.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | import pytest
4 | import anndata
5 | 
6 | import pandas as pd
7 | 
8 | from ibaqpy.commands.correct_batches import run_batch_correction
9 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID, PROTEIN_NAME, IBAQ, IBAQ_BEC
10 | 
11 | TESTS_DIR = Path(__file__).parent
12 | 
13 | logger = logging.getLogger(__name__)
14 | logger.addHandler(logging.NullHandler())
15 | 
16 | 
17 | def test_correct_batches():
18 |     """
19 |     Test the `run_batch_correction` function to ensure it correctly processes iBAQ values
20 |     from TSV files, generates the expected output files, and handles various error cases.
21 | 
22 |     This test verifies:
23 |     - The creation and non-emptiness of the corrected output TSV file.
24 |     - The creation and correct shape of the AnnData object.
25 |     - Handling of invalid sample IDs by raising a ValueError.
26 |     - Handling of missing required columns by raising a ValueError.
27 |     - Handling of invalid file patterns by raising a ValueError.
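
    Note (an inference from the fixtures below, not a documented contract):
    sample ids are expected to encode their batch of origin, which is why
    the malformed ids under tests/invalid-samples trigger a ValueError.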
28 | """ 29 | args = { 30 | "folder": TESTS_DIR / "ibaq-raw-hela", 31 | "pattern": "*ibaq.tsv", 32 | "comment": "#", 33 | "sep": "\t", 34 | "output": TESTS_DIR / "example/ibaq_corrected_combined.tsv", 35 | "sample_id_column": SAMPLE_ID, 36 | "protein_id_column": PROTEIN_NAME, 37 | "ibaq_raw_column": IBAQ, 38 | "ibaq_corrected_column": IBAQ_BEC, 39 | "export_anndata": True, 40 | } 41 | logging.debug("Arguments for run_batch_correction: %s", args) 42 | run_batch_correction(**args) 43 | 44 | # Assert the output file is created and not empty 45 | output_path = Path(args["output"]) 46 | assert output_path.exists(), f"Expected output file {output_path} was not created." 47 | df = pd.read_csv(output_path, sep=args["sep"]) 48 | assert not df.empty, "The corrected output file is empty." 49 | 50 | # Assert the AnnData object is created 51 | adata_path = output_path.with_suffix(".h5ad") 52 | assert adata_path.exists(), f"Expected AnnData file {adata_path} was not created." 53 | 54 | # Read the AnnData object and check shape and layers 55 | adata = anndata.read_h5ad(adata_path) 56 | logger.info(adata) 57 | assert adata.shape == (46, 3476) 58 | assert adata.layers[IBAQ_BEC].shape == (46, 3476) 59 | 60 | # Test invalid sample IDs 61 | with pytest.raises(ValueError): 62 | args["folder"] = TESTS_DIR / "invalid-samples" 63 | run_batch_correction(**args) 64 | 65 | # Test missing required columns 66 | with pytest.raises(ValueError): 67 | args["folder"] = TESTS_DIR / "ibaq-raw-hela" 68 | args["sample_id_column"] = "NonexistentColumn" 69 | run_batch_correction(**args) 70 | 71 | # Test invalid file pattern 72 | with pytest.raises(ValueError): 73 | args["pattern"] = "nonexistent*.tsv" 74 | run_batch_correction(**args) 75 | 76 | 77 | if __name__ == "__main__": 78 | test_correct_batches() 79 | -------------------------------------------------------------------------------- /tests/test_file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.file_utils import create_anndata, combine_ibaq_tsv_files 7 | from ibaqpy.ibaq.ibaqpy_commons import ( 8 | SAMPLE_ID, 9 | PROTEIN_NAME, 10 | IBAQ, 11 | IBAQ_NORMALIZED, 12 | IBAQ_LOG, 13 | ) 14 | 15 | TESTS_DIR = Path(__file__).parent 16 | 17 | 18 | def test_combine_ibaq_tsv_files(): 19 | """ 20 | Test functions for combining iBAQ TSV files and creating AnnData objects. 21 | 22 | Functions: 23 | - test_combine_ibaq_tsv_files: Tests the combination of multiple iBAQ TSV files 24 | into a single DataFrame and verifies the shape of the resulting DataFrame. 25 | - test_create_anndata: Tests the creation of an AnnData object from a DataFrame 26 | with specified observation and variable columns, additional layers, and metadata. 27 | """ 28 | ibaq_dir = TESTS_DIR / "ibaq-raw-hela" 29 | files_pattern = "*ibaq.tsv" 30 | df_ibaq = combine_ibaq_tsv_files(dir_path=str(ibaq_dir), pattern=files_pattern, sep="\t") 31 | logging.info(df_ibaq.head()) 32 | assert df_ibaq.shape == (83725, 14) 33 | 34 | 35 | def test_create_anndata(): 36 | """ 37 | Test functions for combining iBAQ TSV files and creating AnnData objects. 38 | 39 | Functions: 40 | - test_combine_ibaq_tsv_files: Tests the combination of multiple iBAQ TSV files 41 | into a single DataFrame and verifies the shape of the resulting DataFrame. 
42 | 
43 |     "HeLa" is also expected among the Condition values.
44 |     """
45 |     df = pd.read_csv(TESTS_DIR / "ibaq-raw-hela/PXD000396.ibaq.tsv", sep="\t")
46 |     obs_col = SAMPLE_ID
47 |     var_col = PROTEIN_NAME
48 |     value_col = IBAQ
49 |     layers = [IBAQ_NORMALIZED, IBAQ_LOG]
50 |     adata = create_anndata(
51 |         df=df,
52 |         obs_col=obs_col,
53 |         var_col=var_col,
54 |         value_col=value_col,
55 |         layer_cols=layers,
56 |         obs_metadata_cols=["Condition"],
57 |         var_metadata_cols=[],
58 |     )
59 |     logging.info(adata)
60 |     assert adata.shape == (12, 3096)
61 |     assert adata.layers[IBAQ_NORMALIZED].shape == (12, 3096)
62 |     assert adata.layers[IBAQ_LOG].shape == (12, 3096)
63 |     assert "HeLa" in adata.obs["Condition"].values
--------------------------------------------------------------------------------
/tests/test_ibaqpy.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | from ibaqpy.ibaq.peptides2protein import peptides_to_protein
4 | 
5 | from pathlib import Path
6 | 
7 | TESTS_DIR = Path(__file__).parent
8 | 
9 | logger = logging.getLogger(__name__)
10 | logger.addHandler(logging.NullHandler())
11 | 
12 | 
13 | def test_ibaq_compute():
14 |     """
15 |     Test the computation of IBAQ values using the peptides_to_protein function.
16 | 
17 |     This test sets up the necessary arguments, including paths to input files,
18 |     enzyme type, normalization options, and output paths, and then calls the
19 |     peptides_to_protein function to perform the computation. It logs the
20 |     arguments for verification before execution.
21 | 
22 |     The test uses example data files located in the 'example' directory and
23 |     outputs results to the 'out' directory.
24 |     """
25 |     args = {
26 |         "fasta": str(
27 |             TESTS_DIR / "example/Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta"
28 |         ),
29 |         "peptides": str(TESTS_DIR / "example/PXD017834-peptides.csv"),
30 |         "enzyme": "Trypsin",
31 |         "normalize": True,
32 |         "min_aa": 7,
33 |         "max_aa": 30,
34 |         "tpa": True,
35 |         "ruler": True,
36 |         "ploidy": 2,
37 |         "cpc": 200,
38 |         "organism": "human",
39 |         "output": str(TESTS_DIR / "example" / "out" / "PXD017834-ibaq.tsv"),
40 |         "verbose": True,
41 |         "qc_report": str(TESTS_DIR / "example/out/QCprofile.pdf"),
42 |     }
43 |     logger.info(args)
44 |     peptides_to_protein(**args)
--------------------------------------------------------------------------------
/tests/test_ibaqpy_postprocessing.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pandas as pd
3 | 
4 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID, IBAQ_NORMALIZED
5 | from ibaqpy.ibaq.ibaqpy_postprocessing import (
6 |     remove_samples_low_protein_number,
7 |     remove_missing_values,
8 |     describe_expression_metrics,
9 | )
10 | import logging
11 | 
12 | TESTS_DIR = Path(__file__).parent
13 | 
14 | 
15 | def test_remove_samples_low_protein_number():
16 |     """
17 |     Test removing samples with a low number of proteins.
18 | 
19 |     Reads the PXD017834 example iBAQ table and applies
20 |     remove_samples_low_protein_number with min_protein_num=286, keeping
21 |     only samples that quantify at least that many proteins.
22 | 
23 |     The number of samples before and after filtering is logged rather
24 |     than asserted, so the test mainly guards against regressions that
25 |     would make the function raise.
26 | """ 27 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 28 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 29 | number_samples = len(ibaq_df[SAMPLE_ID].unique()) 30 | logging.info("The number of samples in the dataframe {}".format(number_samples)) 31 | 32 | new_ibaq = remove_samples_low_protein_number(ibaq_df, min_protein_num=286) 33 | 34 | number_samples = len(new_ibaq[SAMPLE_ID].unique()) 35 | logging.info( 36 | "The number of samples with number of proteins higher than 286 is {}".format( 37 | number_samples 38 | ) 39 | ) 40 | 41 | 42 | def test_remove_missing_values(): 43 | """ 44 | Test functions for post-processing iBAQ data. 45 | 46 | These tests validate the functionality of the following operations: 47 | - Removing samples with a low number of proteins. 48 | - Removing samples with a high percentage of missing values. 49 | - Describing expression metrics across samples. 50 | 51 | Each test reads a sample iBAQ dataset, applies the respective function, 52 | and logs the number of samples before and after processing. 53 | """ 54 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 55 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 56 | number_samples = len(ibaq_df[SAMPLE_ID].unique()) 57 | logging.info("The number of samples in the dataframe {}".format(number_samples)) 58 | new_ibaq = remove_missing_values( 59 | ibaq_df, missingness_percentage=1, expression_column=IBAQ_NORMALIZED 60 | ) 61 | number_samples = len(new_ibaq[SAMPLE_ID].unique()) 62 | logging.info( 63 | "The number of samples with less than 1% of missing values is {}".format(number_samples) 64 | ) 65 | 66 | 67 | def test_describe_expression_metrics(): 68 | """ 69 | Test functions for post-processing iBAQ data. 70 | 71 | These tests validate the functionality of the following operations: 72 | - Removing samples with a low number of proteins. 73 | - Removing samples with a high percentage of missing values. 74 | - Describing expression metrics across samples. 75 | 76 | Each test reads a sample iBAQ dataset, applies the respective function, 77 | and logs the number of samples before and after processing. 78 | """ 79 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 80 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 81 | 82 | metrics = describe_expression_metrics(ibaq_df) 83 | logging.info(metrics) 84 | -------------------------------------------------------------------------------- /tests/test_peptide_normalize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ibaqpy.ibaq.peptide_normalization import peptide_normalization 4 | from pathlib import Path 5 | 6 | TESTS_DIR = Path(__file__).parent 7 | 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.NullHandler()) 10 | 11 | 12 | def test_feature_assembly(): 13 | """ 14 | Test the peptide normalization process by setting up arguments for the 15 | `peptide_normalization` function and executing it. This test checks the 16 | function's ability to process a feature table from a parquet file and an 17 | SDRF file, applying various filtering and normalization steps, and saving 18 | the output to a CSV file. It ensures that the output file is removed before 19 | the test to avoid conflicts. 20 | 21 | The test uses the following parameters: 22 | - parquet: Path to the input parquet file containing feature data. 23 | - sdrf: Path to the SDRF file for experimental metadata. 24 | - min_aa: Minimum number of amino acids required for peptides. 
25 | - min_unique: Minimum number of unique peptides required for proteins. 26 | - remove_ids: Path to a file with protein IDs to remove, if any. 27 | - remove_decoy_contaminants: Flag to remove decoy and contaminant proteins. 28 | - remove_low_frequency_peptides: Flag to remove low-frequency peptides. 29 | - output: Path to the output CSV file for normalized peptide intensities. 30 | - skip_normalization: Flag to skip the normalization process. 31 | - nmethod: Method for feature-level normalization. 32 | - pnmethod: Method for peptide-level normalization. 33 | - log2: Flag to apply log2 transformation to intensities. 34 | - save_parquet: Flag to save the output as a parquet file. 35 | """ 36 | 37 | args = { 38 | "parquet": str(TESTS_DIR / "example/feature.parquet"), 39 | "sdrf": str(TESTS_DIR / "example/PXD017834-TMT.sdrf.tsv"), 40 | "min_aa": 7, 41 | "min_unique": 2, 42 | "remove_ids": None, 43 | "remove_decoy_contaminants": True, 44 | "remove_low_frequency_peptides": True, 45 | "output": str(TESTS_DIR / "example" / "out" / "PXD017834-peptides-norm.csv"), 46 | "skip_normalization": False, 47 | "nmethod": "median", 48 | "pnmethod": "none", 49 | "log2": True, 50 | "save_parquet": True, 51 | } 52 | logger.info(args) 53 | out = Path(args["output"]) 54 | if out.exists(): 55 | out.unlink() 56 | peptide_normalization(**args) 57 | --------------------------------------------------------------------------------
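
For readers skimming the test suite above: the two pipeline stages exercised by
test_feature_assembly and test_ibaq_compute chain naturally. A compact
end-to-end sketch (illustrative only; file paths are placeholders, while the
argument names and values mirror the test calls in this repository):

from ibaqpy.ibaq.peptide_normalization import peptide_normalization
from ibaqpy.ibaq.peptides2protein import peptides_to_protein

# Step 1: assemble features into normalized peptide intensities, driven by a
# quantms feature table (parquet) and its SDRF experiment design.
peptide_normalization(
    parquet="feature.parquet",
    sdrf="experiment.sdrf.tsv",
    min_aa=7,
    min_unique=2,
    remove_ids=None,
    remove_decoy_contaminants=True,
    remove_low_frequency_peptides=True,
    output="peptides-norm.csv",
    skip_normalization=False,
    nmethod="median",
    pnmethod="none",
    log2=True,
    save_parquet=True,
)

# Step 2: roll the normalized peptides up to protein-level iBAQ values,
# with TPA and proteomic-ruler outputs enabled as in the tests.
peptides_to_protein(
    fasta="proteome.fasta",
    peptides="peptides-norm.csv",
    enzyme="Trypsin",
    normalize=True,
    min_aa=7,
    max_aa=30,
    tpa=True,
    ruler=True,
    ploidy=2,
    cpc=200,
    organism="human",
    output="ibaq.tsv",
    verbose=True,
    qc_report="QCprofile.pdf",
)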