├── .github └── workflows │ ├── conda-build.yml │ ├── python-app.yml │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── ibaqpy-batch-correction-example.html ├── ibaqpy-batch-correction-example.ipynb └── images │ ├── 9_tissues-boxplot.png │ ├── 9_tissues-density.png │ ├── PXD007683-11samples-density.png │ ├── PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png │ ├── PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png │ ├── PXD007683-LFQ-11samples-no_cov.png │ ├── PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png │ ├── PXD007683-LFQ-ibaq-vs-maxquant-density.png │ ├── PXD007683-LFQ-no_cov.png │ ├── PXD007683-TMTvsLFQ-boxplot.png │ ├── PXD007683-TMTvsLFQ-density.png │ ├── PXD019909-11samples-density.png │ ├── PXD019909-TMTvsLFQ-density.png │ ├── fold_change_lfq.png │ ├── fold_change_tmt.png │ ├── method_mean_cv_016999_lfq.png │ ├── method_mean_cv_lfq.png │ ├── method_mean_cv_tmt.png │ ├── method_per_p_cv_016999_lfq.png │ ├── method_per_p_cv_lfq.png │ ├── method_per_p_cv_tmt.png │ ├── missing_peptides_by_sample.png │ ├── missing_value_016999_lfq.png │ └── per_protein_cv.png ├── data ├── __init__.py ├── contaminants_ids.tsv ├── high_abundant_proteins.tsv ├── histones.json ├── ibaqpy.drawio └── ibaqpy.drawio.png ├── environment.yaml ├── ibaqpy ├── __init__.py ├── commands │ ├── __init__.py │ ├── correct_batches.py │ ├── features2peptides.py │ ├── peptides2protein.py │ └── tsne_visualization.py ├── data │ ├── __init__.py │ ├── data.py │ └── organisms.json ├── ibaq │ ├── __init__.py │ ├── combiner.py │ ├── file_utils.py │ ├── ibaqpy_commons.py │ ├── ibaqpy_postprocessing.py │ ├── imputation_methods.py │ ├── logger.py │ ├── logging_config.py │ ├── peptide_normalization.py │ ├── peptides2protein.py │ ├── utils.py │ └── write_queue.py ├── ibaqpyc.py └── model │ ├── __init__.py │ ├── normalization.py │ ├── organism_metadata.py │ └── quantification_type.py ├── pyproject.toml ├── qodana.yaml ├── recipe ├── conda_build_config.yaml └── meta.yaml ├── requirements.txt └── tests ├── __init__.py ├── example ├── Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta ├── PXD017834-TMT.sdrf.tsv ├── PXD017834-example-ibaq.tsv ├── PXD017834-peptides.csv ├── feature.parquet └── out │ └── .gitignore ├── ibaq-raw-hela ├── PXD000396.ibaq.tsv ├── PXD005481.ibaq.tsv └── PXD039414.ibaq.tsv ├── test_batch_correction.py ├── test_file_utils.py ├── test_ibaqpy.py ├── test_ibaqpy_postprocessing.py └── test_peptide_normalize.py /.github/workflows/conda-build.yml: -------------------------------------------------------------------------------- 1 | name: Conda Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | defaults: 9 | run: 10 | shell: bash -el {0} 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v4 # Update to latest version 14 | 15 | - name: Set up Miniconda 16 | uses: conda-incubator/setup-miniconda@v3 17 | with: 18 | mamba-version: "*" 19 | channels: conda-forge,bioconda 20 | cache-downloads: true 21 | auto-update-conda: false 22 | activate-environment: test 23 | python-version: "3.12" 24 | 25 | - name: Setup conda-build and anaconda-client 26 | run: | 27 | mamba install -q conda-build anaconda-client conda-verify 28 | 29 | - name: Build package 30 | run: | 31 | conda build purge-all 32 | conda config --set solver libmamba 33 | conda config --set channel_priority strict 34 | conda build recipe --suppress-variables --override-channels --channel 
conda-forge --channel bioconda --no-anaconda-upload --output-folder ./ 35 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: ["main", "dev"] 9 | pull_request: 10 | branches: ["main", "dev"] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: ["3.9", "3.10", "3.11"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install flake8 pytest 33 | pip install poetry 34 | poetry build 35 | pip install dist/*.whl 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | poetry run pytest 45 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: ["main", "dev"] 9 | pull_request: 10 | branches: ["main", "dev"] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | pip install poetry 32 | poetry build 33 | pip install dist/*.whl 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Test with pytest 41 | run: | 42 | pytest 43 | - name: Test commandline tool 44 | run: | 45 | ibaqpyc --help 46 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v3 26 | with: 27 | python-version: "3.x" 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install build 32 | - name: Build package 33 | run: python -m build 34 | - name: Publish package 35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | verbose: true 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | res.csv 3 | /data/PXD004682-Peptide-Intensities.tsv 4 | /data/PXD008934.sdrf.tsv.gz 5 | /data/PXD008934-out.mzTab.gz 6 | /data/PXD008934-out_msstats.csv.gz 7 | /data/PXD008934-out_triqler.tsv.gz 8 | venv 9 | /venv/ 10 | /compute-all.sh 11 | /ibaqpy.egg-info/ 12 | /ibaqpy_temp/ 13 | /tests/PXD003947/IBAQ-QCprofile.pdf 14 | /tests/PXD003947/PXD003947-ibaq-norm.csv 15 | /tests/PXD003947/PXD003947-peptides-norm.csv 16 | /tests/PXD003947/PXD003947-peptides-norm.parquet 17 | /build/ 18 | /dist/ 19 | /**/__pycache__/ 20 | .qodo 21 | /.vscode/ 22 | /tests/example/ibaq_corrected_combined.h5ad 23 | /tests/example/ibaq_corrected_combined.tsv 24 | /tests/example/PXD017834-ibaq.tsv 25 | /tests/example/PXD017834-peptides-norm.csv 26 | /tests/example/PXD017834-peptides-norm.parquet 27 | /tests/example/QCprofile.pdf 28 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use ibaqpy in your research, please cite this work." 
3 | title: "ibaqpy: A scalable Python package for baseline quantification in proteomics leveraging SDRF metadata" 4 | authors: 5 | - family-names: "Zheng" 6 | given-names: "Ping" 7 | - family-names: "Audain" 8 | given-names: "Enrique" 9 | - family-names: "Webel" 10 | given-names: "Henry" 11 | - family-names: "Dai" 12 | given-names: "Chengxin" 13 | - family-names: "Klein" 14 | given-names: "Joshua" 15 | - family-names: "Hitz" 16 | given-names: "Marc-Phillip" 17 | - family-names: "Sachsenberg" 18 | given-names: "Timo" 19 | - family-names: "Bai" 20 | given-names: "Mingze" 21 | - family-names: "Perez-Riverol" 22 | given-names: "Yasset" 23 | abstract: "Intensity-based absolute quantification (iBAQ) is essential in proteomics as it allows for the assessment of a protein's absolute abundance in various samples or conditions. However, the computation of these values for increasingly large-scale and high-throughput experiments, such as those using DIA, TMT, or LFQ workflows, poses significant challenges in scalability and reproducibility. Here, we present ibaqpy, a Python package designed to compute iBAQ values efficiently for experiments of any scale." 24 | date-released: "2025-02-08" 25 | doi: "10.1101/2025.02.08.637208" 26 | url: "https://www.biorxiv.org/content/early/2025/02/08/2025.02.08.637208" 27 | journal: "bioRxiv" 28 | publisher: "Cold Spring Harbor Laboratory" 29 | version: "2025.02.08.637208" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 BigBio Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include ibaqpy/data/ * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ibaqpy 2 | 3 | [![Python application](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml) 4 | [![Upload Python Package](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml) 5 | [![Codacy Badge](https://app.codacy.com/project/badge/Grade/6a1961c7d57c4225b4891f73d58cac6b)](https://app.codacy.com/gh/bigbio/ibaqpy/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) 6 | [![PyPI version](https://badge.fury.io/py/ibaqpy.svg)](https://badge.fury.io/py/ibaqpy) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/ibaqpy) 8 | 9 | iBAQ (Intensity-Based Absolute Quantification) determines the abundance of a protein by dividing the total precursor intensities by the number of theoretically observable peptides of the protein ([manuscript here](https://pubmed.ncbi.nlm.nih.gov/16219938/)). ibaqpy is a Python package that computes iBAQ values starting from a feature parquet file from [quantmsio](https://github.com/bigbio/quantms.io) and an [SDRF](https://github.com/bigbio/proteomics-sample-metadata) file. In addition, the package computes other iBAQ-derived values, including rIBAQ, log2, and ppb. 10 | 11 | ibaqpy also allows computing the TPA value (Total Protein Approach), protein copy number, and protein concentration. TPA is computed by summing the peptide intensities of each protein and dividing by its molecular mass, which yields the relative concentration of each protein. By using [ProteomicRuler](https://www.sciencedirect.com/science/article/pii/S1535947620337749), it is possible to calculate the protein copy number and absolute concentration. OpenMS is used to calculate the theoretical molecular mass of each protein. As with the iBAQ calculation, the TPA value of a protein group is the sum of its intensities divided by the sum of the theoretical molecular masses. 12 | 13 | The protein copy number calculation follows this formula: 14 | 15 | ``` 16 | protein copies per cell = protein MS-signal * (avogadro / molecular mass) * (DNA mass / histone MS-signal) 17 | ``` 18 | 19 | For the cellular protein copy number calculation, the UniProt accessions of the histones of the given species are obtained first, and the molecular mass of its DNA is calculated. The dataframe is then grouped by condition, and the copy number, molar amount, and mass of the proteins are calculated. For protein concentration, the cell volume is first derived from the cellular protein concentration, and the protein mass is then divided by this volume to obtain the intracellular protein concentration.
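
To make the proteomic-ruler step concrete, the formula above can be sketched in a few lines of pandas. This is a minimal illustration, not ibaqpy's exact implementation: the column names, the histone accession set (compare `data/histones.json`), and the DNA mass value are assumptions for the example.

```python
import pandas as pd

AVOGADRO = 6.02214076e23  # molecules per mole
DNA_MASS_GRAMS = 6.5e-12  # assumed DNA mass of a diploid human cell (~6.5 pg)

def proteomic_ruler_copies(df: pd.DataFrame, histone_accessions: set) -> pd.Series:
    """protein copies per cell = MS-signal * (avogadro / molecular mass) * (DNA mass / histone MS-signal)."""
    # The summed histone MS-signal anchors the ruler, since histone mass tracks DNA mass.
    histone_signal = df.loc[df["ProteinName"].isin(histone_accessions), "NormIntensity"].sum()
    return (
        df["NormIntensity"]
        * (AVOGADRO / df["MolecularWeight"])
        * (DNA_MASS_GRAMS / histone_signal)
    )
```
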
20 | 21 | ### Overview of ibaq-base values computation 22 | 23 | As mentioned before, ibaq values are calculated by dividing the total precursor intensities by the number of theoretically observable peptides of the protein. We use the following steps to calculate the iBAQ values: 24 | 25 | - _Observable peptides_, the protein sequence is digested in silico using a specific enzyme. The current version of this tool uses OpenMS to load the FASTA file and [ProteaseDigestion](https://openms.de/current_doxygen/html/classOpenMS_1_1ProteaseDigestion.html) to digest the protein sequences, finally yielding the theoretical peptide number of each protein (see the sketch below). 26 | 27 | - _Total precursor intensities_, the total intensity of a protein is calculated by summing the intensity of all peptides that belong to the protein. The intensity values are obtained from the feature parquet file in [quantms.io](https://github.com/bigbio/quantms.io). 28 | 29 | > Note: If a protein-group exists in the peptide intensity dataframe, the intensity of all proteins in the protein-group is summed based on the above steps and then divided by the number of proteins in the protein-group.
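
As an illustration of the digestion step, here is a minimal pyopenms sketch that counts the theoretically observable peptides of each protein in a FASTA file. The enzyme name and length bounds are example values, not necessarily ibaqpy's defaults:

```python
from pyopenms import AASequence, FASTAFile, ProteaseDigestion

def count_observable_peptides(fasta_path: str, enzyme: str = "Trypsin",
                              min_aa: int = 7, max_aa: int = 30) -> dict:
    """Map protein identifier -> number of theoretical peptides after in-silico digestion."""
    entries = []
    FASTAFile().load(fasta_path, entries)  # fills `entries` with FASTAEntry objects
    digestion = ProteaseDigestion()
    digestion.setEnzyme(enzyme)
    counts = {}
    for entry in entries:
        peptides = []
        digestion.digest(AASequence.fromString(entry.sequence), peptides)
        # Keep only peptides within the configured length range.
        counts[entry.identifier] = sum(1 for p in peptides if min_aa <= p.size() <= max_aa)
    return counts
```
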
30 | 31 | ### Other values 32 | 33 | - `Ibaq` - the iBAQ value is calculated as `Total precursor intensities / Number of observable peptides` 34 | 35 | - `IbaqNorm` - the ibaq value normalized by the total ibaq of the sample, `ibaq / sum(ibaq)`; the sum is taken over the proteins in the same _sample + condition_. 36 | 37 | - `IbaqLog` - the ibaq log is calculated as `10 + log10(IbaqNorm)`. This normalized ibaq value was developed [by the ProteomicsDB Team](https://academic.oup.com/nar/article/46/D1/D1271/4584631). 38 | 39 | - `IbaqPpb` - the resulting IbaqNorm multiplied by 100M, `IbaqNorm * 100'000'000`. This method was originally developed [by the PRIDE Team](https://www.nature.com/articles/s41597-021-00890-2). 40 | 41 | - `IbaqBec` - the ibaq value after batch effect correction, using the combat-norm algorithm from the inmoose package. 42 | 43 | - `TPA` - the TPA value is calculated as `NormIntensity / MolecularWeight` 44 | 45 | - `CopyNumber` - the protein copy number is calculated using a proteomic ruler approach. 46 | 47 | - `Concentration[nM]` - the protein concentration is calculated using the total weight and a provided concentration per cell (cpc). 48 |
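
For illustration, the three normalization transforms above can be written as a short pandas sketch. It assumes a long-format dataframe with `SampleID`, `Condition`, and `Ibaq` columns; the column names are illustrative, not ibaqpy's internal code:

```python
import numpy as np
import pandas as pd

def add_ibaq_transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Add IbaqNorm, IbaqLog, and IbaqPpb columns following the definitions above."""
    # IbaqNorm: each iBAQ divided by the total iBAQ of its sample + condition group.
    df["IbaqNorm"] = df["Ibaq"] / df.groupby(["SampleID", "Condition"])["Ibaq"].transform("sum")
    # IbaqLog: the ProteomicsDB-style shifted log, 10 + log10(IbaqNorm).
    df["IbaqLog"] = 10 + np.log10(df["IbaqNorm"])
    # IbaqPpb: IbaqNorm scaled by 100 million, as used by PRIDE.
    df["IbaqPpb"] = df["IbaqNorm"] * 100_000_000
    return df
```
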
49 | ### From quantms to Ibaq values 50 | 51 | ![Ibaq](./data/ibaqpy.drawio.png "IBAQ") 52 | 53 | The output of quantms is converted into a quantms.io feature file. quantms.io provides a unified format for processing report files, including peptide intensity information. In quantms.io, you can use the `convert-ibaq` command, providing a **feature file** and an **SDRF file**, to inject the experimental information into the feature file, generating input ready for ibaqpy. 54 | 55 | ```asciidoc 56 | >$ quantmsioc convert-feature --sdrf_file PXD004452-Hella-trypsin.sdrf.tsv --msstats_file PXD004452-Hella-trypsin.sdrf_openms_design_msstats_in.csv --mztab_file PXD004452-Hella-trypsin.sdrf_openms_design_openms.mzTab --file_num 30 --output_folder res --duckdb_max_memory 64GB --output_prefix_file PXD004452 57 | >$ quantmsioc convert-ibaq --feature_file res/PXD004452-6c224f5a-7c1f-46f9-9dae-1541baeef8fe.feature.parquet --sdrf_file PXD004452-Hella-trypsin.sdrf.tsv --output_folder ibaq --output_prefix_file PXD004452 58 | ``` 59 | 60 | A feature in quantms.io is the combination of the following columns: 61 | 62 | - `ProteinName`: Protein name 63 | - `Peptidoform`: Peptide sequence including post-translational modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)` 64 | - `PEPTIDE_CANONICAL`: Canonical peptide sequence 65 | - `PrecursorCharge`: Precursor charge 66 | - `Channel`: Label channel 67 | - `Condition`: Condition label `(e.g. heart)` 68 | - `BioReplicate`: Biological replicate index `(e.g. 1)` 69 | - `Run`: Run index `(e.g. 1)` 70 | - `Fraction`: Fraction index `(e.g. 1)` 71 | - `Intensity`: Peptide intensity 72 | - `Reference`: Reference file 73 | - `SampleID`: Sample ID `(e.g. PXD003947-Sample-3)` 74 | 75 | In summary, each feature is the unique combination of a peptide sequence including modifications (peptidoform), precursor charge state, condition, biological replicate, run, fraction, reference_file_name, sample_accession, and a given intensity. In order to go from these features to protein ibaq values, the package does the following: 76 | 77 | #### Data preprocessing 78 | 79 | In this step, `features2peptides`, ibaqpy will: 80 | 81 | - Parse the protein identifiers and retain only unique peptides. 82 | - Remove lines where the intensity or the study condition is empty. This can happen in the following cases: 83 | - The DIA pipeline sometimes releases intensities with value 0 for some features. 84 | - quantms.io does not contain feature information for some conditions. This extreme case can happen when no ID/Quant was found for a given condition during the analysis. 85 | - Filter peptides with fewer amino acids than `min_aa`. 86 | - Remove low-confidence proteins according to a threshold on the number of unique peptides: we use a threshold of 2 unique peptides to consider a protein high-confidence. This default value of 2 is applied if not specified by the user; the threshold can be changed with the `--min_unique` parameter. 87 | - Filter decoy, contaminant, and entrapment proteins: proteins with the prefixes `DECOY`, `CONTAMINANT`, or `ENTRAPMENT` can be removed; by default, this filter is not applied. If users want to remove these proteins, they can use the `--remove_decoy_contaminants` parameter. 88 | - Filter user-specified proteins: the user can provide a list of protein identifiers to remove from the analysis using the `--remove_ids` parameter. This removes proteins that could bias the intensity normalization. For example, ALBU_HUMAN can be overexpressed in human tissues, which is why we may want to remove it when analyzing tissue data. 89 | - Normalize at the feature level between MS runs (technical repetitions): 90 | - When `MS runs > 1` in a sample, the `mean` of the per-run averages (`mean`, `median`, or `iqr`) is calculated (SampleMean). 91 | - The ratio between the SampleMean and each MS run's average is used as a reference to scale the original intensities. 92 | - Merge peptidoforms across fractions and technical repetitions: combine technical replicates and fractions from the same sample. 93 | - Normalize the data at the sample level: 94 | - `globalMedian`: adjusts all samples to a global median. 95 | - `conditionMedian`: all samples under the same condition are adjusted to the median value of that condition. 96 | - Remove low-frequency peptides if `sample number > 1`: this filter is enabled with the `--remove_low_frequency_peptides` parameter and, by default, removes peptides present in less than 20% of the samples. 97 | - Assemble peptidoforms into peptides: 98 | A peptidoform is a combination of `PeptideSequence(Modifications) + Charge + BioReplicate + Fraction` (among other features), and a peptide is a combination of `PeptideSequence(Canonical) + BioReplicate`. ibaqpy will: 99 | - Select the peptidoform with the highest intensity across different modifications, fractions, and technical replicates. 100 | - Merge peptidoforms across different charges and combine them into peptides. To merge peptidoforms, the package applies the `sum` of their intensity values. 101 | - Intensity transformation to log: the user can specify the `--log2` parameter to transform the peptide intensity values to log2 before normalization. 102 | 103 | > Note: At the moment, ibaqpy computes the ibaq values based only on unique peptides. Shared peptides are discarded. However, if a group of proteins shares the same unique peptides (e.g., Pep1 -> Prot1;Prot2 and Pep2 -> Prot1;Prot2), the intensity of the proteins is summed and divided by the number of proteins in the group. 104 | 105 | #### Calculate the IBAQ Value 106 | 107 | First, the peptide intensity dataframe is grouped by protein name, sample name, and condition, and the protein intensity of each group is summed. Depending on the experiment type, the same protein may have missing peptides in different samples, so the number of peptides detected for a protein varies across samples. To handle this difference, the summed intensity is normalized within each group using the formula `sum(peptides) / n`, where n is the number of detected peptides. Finally, the normalized intensity of the protein is divided by the number of theoretical peptides (see the sketch below). See details in `peptides2protein`. 108 | 109 | > Note: In all scripts and result files, _uniprot accession_ is used as the protein identifier.
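
Schematically, the computation just described reduces to a groupby and two divisions. Below is a minimal pandas sketch that mirrors the text above; the column names and the `theoretical_peptides` mapping (from the in-silico digestion) are illustrative assumptions, not ibaqpy's exact internals:

```python
import pandas as pd

def compute_ibaq(peptides: pd.DataFrame, theoretical_peptides: dict) -> pd.DataFrame:
    """iBAQ per protein/sample/condition: (sum(peptides) / n) / theoretical peptide count."""
    grouped = (
        peptides.groupby(["ProteinName", "SampleID", "Condition"])["NormIntensity"]
        .agg(total="sum", n="size")
        .reset_index()
    )
    # Normalize the summed intensity by the number of detected peptides (sum(peptides) / n),
    # then divide by the number of theoretically observable peptides of the protein.
    grouped["Ibaq"] = (grouped["total"] / grouped["n"]) / grouped["ProteinName"].map(
        theoretical_peptides
    )
    return grouped[["ProteinName", "SampleID", "Condition", "Ibaq"]]
```
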
110 | 111 | ### How to install ibaqpy 112 | 113 | ibaqpy is available on PyPI and can be installed using pip: 114 | 115 | ```asciidoc 116 | pip install ibaqpy 117 | ``` 118 | 119 | You can also install the package from source: 120 | 121 | 1. Clone the repository: 122 | 123 | ```asciidoc 124 | >$ git clone https://github.com/bigbio/ibaqpy 125 | >$ cd ibaqpy 126 | ``` 127 | 128 | 2. Create the conda environment: 129 | 130 | ```asciidoc 131 | >$ mamba env create -f environment.yaml 132 | ``` 133 | 134 | 3. Install ibaqpy: 135 | 136 | ```asciidoc 137 | >$ pip install . 138 | ``` 139 | 140 | ### Collecting intensity files from quantms.org 141 | 142 | Absolute quantification files are stored at the following URL: 143 | 144 | ``` 145 | https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/quantms-data/ 146 | ``` 147 | 148 | Inside each project reanalysis folder, the proteomicslfq folder contains the feature file, named with the structure `{Name of the project}.{Random uuid}.feature.parquet`. 149 | 150 | E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/quantms-data/MSV000079033.1/MSV000079033.1-bd44c7e3-654c-444d-9e21-0f701d6dac94.feature.parquet 151 | 152 | ### Major commands 153 | 154 | #### Features to peptides 155 | 156 | ```asciidoc 157 | ibaqpy features2peptides -p tests/PXD003947/PXD003947-feature.parquet -s tests/PXD003947/PXD003947.sdrf.tsv --remove_ids data/contaminants_ids.tsv --remove_decoy_contaminants --remove_low_frequency_peptides --output tests/PXD003947/PXD003947-peptides-norm.csv 158 | ``` 159 | 160 | ```asciidoc 161 | Usage: features2peptides.py [OPTIONS] 162 | 163 | Options: 164 | -p, --parquet TEXT Parquet feature file generated by quantms.io 165 | -s, --sdrf TEXT SDRF file generated by quantms 166 | --min_aa INTEGER Minimum number of amino acids to filter 167 | peptides 168 | --min_unique INTEGER Minimum number of unique peptides to filter 169 | proteins 170 | --remove_ids TEXT Remove specific protein ids from the 171 | analysis using a file with one id per line 172 | --remove_decoy_contaminants Remove decoy and contaminant proteins from 173 | the analysis 174 | --remove_low_frequency_peptides 175 | Remove peptides that are present in less 176 | than 20% of the samples 177 | --output TEXT Peptide intensity file including all other 178 | properties for normalization 179 | --skip_normalization Skip normalization step 180 | --nmethod TEXT Normalization method used to normalize 181 | feature intensities across technical repetitions 182 | (options: mean, median, iqr, none) 183 | --pnmethod TEXT Normalization method used to normalize 184 | peptide intensities across all samples 185 | (options: globalMedian, conditionMedian, none) 186 | --log2 Transform the peptide intensity values to log2 187 | before normalization 188 | --save_parquet Save normalized peptides to parquet 189 | --help Show this message and exit. 190 | ``` 191 | 192 | #### Compute IBAQ/TPA 193 | 194 | ```asciidoc 195 | ibaqpy peptides2protein -f Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta -p PXD017834-peptides.csv -e Trypsin -n -t -r --ploidy 2 --cpc 200 --organism human --output PXD003947.tsv --verbose 196 | ``` 197 | 198 | ```asciidoc 199 | Usage: peptides2protein [OPTIONS] 200 | 201 | Options: 202 | -f, --fasta TEXT Protein database used to compute IBAQ values 203 | -p, --peptides TEXT Peptide identifications with intensities following the 204 | peptide intensity output 205 | -e, --enzyme TEXT Enzyme used during the analysis of the dataset 206 | (default: Trypsin) 207 | -n, --normalize Normalize IBAQ values by using the total IBAQ of 208 | the experiment 209 | --min_aa INTEGER Minimum number of amino acids to consider a peptide 210 | --max_aa INTEGER Maximum number of amino acids to consider a peptide 211 | -t, --tpa Whether to calculate TPA (is_flag=True) 212 | -r, --ruler Whether to use ProteomicRuler (is_flag=True) 213 | -i, --ploidy Ploidy number (default: 2) 214 | -m, --organism Organism source of the data (default: human) 215 | -c, --cpc Cellular protein concentration (g/L) (default: 200) 216 | -o, --output TEXT Output file with the proteins and ibaq values 217 | --verbose Print additional information about the distributions of 218 | the intensities, number of peptides removed after 219 | normalization, etc. 220 | --qc_report TEXT PDF file to store multiple QC images 221 | --help Show this message and exit. 222 | ``` 223 | 224 | ### Citation 225 | 226 | > Zheng P, Audain E, Webel H, Dai C, Klein J, Hitz MP, Sachsenberg T, Bai M, Perez-Riverol Y.
ibaqpy: A scalable Python package for baseline quantification in proteomics leveraging SDRF metadata. bioRxiv 2025.02.08.637208; doi: https://doi.org/10.1101/2025.02.08.637208 227 | 228 | Other relevant publications: 229 | 230 | > Wang H, Dai C, Pfeuffer J, Sachsenberg T, Sanchez A, Bai M, Perez-Riverol Y. Tissue-based absolute quantification using large-scale TMT and LFQ experiments. Proteomics. 2023 Oct;23(20):e2300188. doi: [10.1002/pmic.202300188](https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/10.1002/pmic.202300188). Epub 2023 Jul 24. PMID: 37488995. 231 | 232 | ### Credits 233 | 234 | - [Julianus Pfeuffer](@jpfeuffer) 235 | - [Yasset Perez-Riverol](@ypriverol) 236 | - [Hong Wang](@WangHong007) 237 | - [Ping Zheng](@zprobot) 238 | - [Joshua Klein](@mobiusklein) 239 | - [Enrique Audain](@enriquea) 240 | -------------------------------------------------------------------------------- /benchmarks/images/9_tissues-boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/9_tissues-boxplot.png -------------------------------------------------------------------------------- /benchmarks/images/9_tissues-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/9_tissues-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-11samples-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-11samples-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-ibaq-ibaqpy-and-maxquant.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-ibaq-vs-maxquant-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-11samples-no_cov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-11samples-no_cov.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-ibaq-ibaqpy-and-maxquant.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-ibaq-vs-maxquant-density.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-ibaq-vs-maxquant-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-LFQ-no_cov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-LFQ-no_cov.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-TMTvsLFQ-boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-TMTvsLFQ-boxplot.png -------------------------------------------------------------------------------- /benchmarks/images/PXD007683-TMTvsLFQ-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD007683-TMTvsLFQ-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD019909-11samples-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD019909-11samples-density.png -------------------------------------------------------------------------------- /benchmarks/images/PXD019909-TMTvsLFQ-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/PXD019909-TMTvsLFQ-density.png -------------------------------------------------------------------------------- /benchmarks/images/fold_change_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/fold_change_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/fold_change_tmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/fold_change_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_mean_cv_tmt.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_mean_cv_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/method_per_p_cv_tmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/method_per_p_cv_tmt.png -------------------------------------------------------------------------------- /benchmarks/images/missing_peptides_by_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/missing_peptides_by_sample.png -------------------------------------------------------------------------------- /benchmarks/images/missing_value_016999_lfq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/missing_value_016999_lfq.png -------------------------------------------------------------------------------- /benchmarks/images/per_protein_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/benchmarks/images/per_protein_cv.png -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/data/__init__.py -------------------------------------------------------------------------------- /data/contaminants_ids.tsv: -------------------------------------------------------------------------------- 1 | P00761 2 | Q32MB2 3 | P19013 4 | Q7RTT2 5 | P15636 6 | P09870 7 | Q9R4J5 8 | P0C1U8 9 | P00766 10 | P13717 11 | Q9U6Y5 12 | P21578 13 | O76009 14 | O76011 15 | O76013 16 | O76014 17 | O76015 18 | P08779 19 | Q14525 20 | Q14532 21 | Q15323 22 | Q92764 23 | Q14533 24 | Q9NSB4 25 | P78385 26 | Q9NSB2 27 | P78386 28 | O43790 29 | Q6IFU5 30 | Q9UE12 31 | Q8IUT8 32 | Q6NT21 33 | Q6ISB0 34 | Q6NTB9 35 | Q6IFU6 36 | P04264 37 | P13647 38 | P35908 39 | P13645 40 | P35527 41 | A3EZ79 42 | P02533 43 | P02538 44 | P48668 45 | P04259 46 | A3EZ82 47 | Q2KIG3 48 | Q0VCM5 49 | Q3SZ57 50 | Q9N2I2 51 | Q3SZH5 52 | P28800 53 | Q1A7A4 54 | P41361 55 | Q2YDI2 56 | Q3Y5Z3 57 | P81644 58 | Q2KJ83 59 | Q2KIT0 60 | A2I7N3 61 | Q3SZV7 62 | Q2KJC7 63 | Q3SZR3 64 | Q28107 65 | P02672 66 | Q1RMN8 67 | Q58D62 68 | P06868 69 | Q2KJF1 70 | P02584 71 | P02777 72 | Q3SX14 73 | P17697 74 | Q6T181 75 | P34955 76 | 
P21752 77 | Q32PJ2 78 | Q28194 79 | P00978 80 | Q5XQN5 81 | Q32PI4 82 | Q9TTE1 83 | Q2KIU3 84 | P01044-1 85 | P67983 86 | Q28065 87 | Q862S4 88 | Q2KIF2 89 | Q3SX28 90 | Q0V8M9 91 | Q148H6 92 | Q29RQ1 93 | Q95M17 94 | P07224 95 | Q2HJF0 96 | Q2KIH2 97 | P13646-1 98 | Q04695 99 | A2I7N0 100 | P12763 101 | P17690 102 | P02769 103 | P02676 104 | P50448 105 | P01030 106 | P01966 107 | P02768-1 108 | P00735 109 | Q03247 110 | Q3ZBS7 111 | Q2UVX4 112 | Q9TT36 113 | Q28085 114 | Q3SX09 115 | P01045-1 116 | Q3ZBD7 117 | Q3MHN2 118 | Q9TRI1 119 | P15497 120 | Q95121 121 | Q05443 122 | P02070 123 | Q2KIS7 124 | Q3MHH8 125 | Q3T052 126 | Q3KUS7 127 | Q1RMK2 128 | Q2TBQ1 129 | Q05B55 130 | A2I7N1 131 | P04258 132 | Q2KJ62 133 | Q0IIK2 134 | Q3MHN5 135 | P02662 136 | P02663 137 | P02666 138 | P02668 139 | P31096 140 | P02754 141 | P00711 142 | P62894 143 | Q29443 144 | P19001 145 | A2AB72 146 | Q8VED5 147 | Q61726 148 | Q3ZAW8 149 | P50446 150 | Q497I4 151 | Q9D312 152 | P08730-1 153 | Q922U2 154 | Q8BGZ7 155 | A2A4G1 156 | Q9QWL7 157 | Q6IME9 158 | Q6NXH9 159 | A2VCT4 160 | P07744 161 | Q6IFZ6 162 | Q6IFX2 163 | Q9R0H5 164 | Q3TTY5 165 | Q0VBK2 166 | P02535-1 167 | Q61782 168 | A2A5Y0 169 | Q99PS0 170 | Q9D646 171 | P05784 172 | Q9DCV7 173 | Q9Z2K1 174 | P07477 175 | P05787 176 | Q6KB66-1 177 | Q7Z794 178 | Q9BYR9 179 | Q9BYQ5 180 | Q9BYR8 181 | Q9BYQ7 182 | Q3LI72 183 | Q9BYR4 184 | Q9BYQ8 185 | P60413 186 | P19012 187 | Q2M2I5 188 | O95678 189 | Q01546 190 | Q99456 191 | Q9H552 192 | P35900 193 | Q3SY84 194 | Q8N1A0 195 | Q8N1N4-2 196 | Q5XKE5 197 | P12035 198 | Q9C075 199 | P08729 200 | Q7Z3Y8 201 | Q7RTS7 202 | Q7Z3Y9 203 | Q7Z3Z0 204 | Q7Z3Y7 205 | P08727 206 | Q14CN4-1 207 | Q3KNV1 208 | Q86YZ3 209 | P20930 210 | Q5D862 211 | SPA34_BOVIN 212 | SPA35_BOVIN 213 | SPA37_BOVIN 214 | KRT86_HUMAN 215 | KT33A_HUMAN 216 | KRT34_HUMAN 217 | KRT36_HUMAN 218 | KRT36_HUMAN 219 | KRT37_HUMAN 220 | KRT38_HUMAN 221 | K2C75_HUMAN 222 | LALBA_BOVIN 223 | THRB_BOVIN 224 | TRYP_PIG 225 | CTRA_BOVIN 226 | AMBP_BOVIN 227 | CO4_BOVIN 228 | KNG1_BOVIN 229 | KNG1_BOVIN 230 | KNG1_BOVIN 231 | KNG2_BOVIN 232 | HBA_BOVIN 233 | HBB_BOVIN 234 | HBBF_BOVIN 235 | K1C14_HUMAN 236 | K1C10_MOUSE 237 | K2C6A_HUMAN 238 | PROF1_BOVIN 239 | CASA1_BOVIN 240 | CASA2_BOVIN 241 | CASB_BOVIN 242 | CASK_BOVIN 243 | FIBA_BOVIN 244 | FIBB_BOVIN 245 | LACB_BOVIN 246 | ALBU_HUMAN 247 | ALBU_BOVIN 248 | PLF4_BOVIN 249 | CO3A1_BOVIN 250 | K2C6B_HUMAN 251 | K2C1_HUMAN 252 | K1C18_MOUSE 253 | K2C8_HUMAN 254 | K2C8_HUMAN 255 | PLMN_BOVIN 256 | PROS_BOVIN 257 | TRY1_HUMAN 258 | K2C4_MOUSE 259 | K1C19_HUMAN 260 | K2C7_HUMAN 261 | K1C13_MOUSE 262 | K1C16_HUMAN 263 | CLOS_HATHI 264 | SSPA_STAAU 265 | K2C3_HUMAN 266 | FETUA_BOVIN 267 | K1C10_HUMAN 268 | K1C13_HUMAN 269 | K2C5_HUMAN 270 | NUCA_SERMA 271 | APOA1_BOVIN 272 | API_ACHLY 273 | APOH_BOVIN 274 | CLUS_BOVIN 275 | K1C19_MOUSE 276 | K1C15_HUMAN 277 | K1C15_HUMAN 278 | K2C4_HUMAN 279 | FILA_HUMAN 280 | LUXY_ALIFS 281 | TYB10_BOVIN 282 | A2AP_BOVIN 283 | OSTP_BOVIN 284 | A1AT_BOVIN 285 | K1C9_HUMAN 286 | K1C20_HUMAN 287 | K22E_HUMAN 288 | ANT3_BOVIN 289 | K2C6C_HUMAN 290 | K2C6A_MOUSE 291 | F12AI_BOVIN 292 | ITIH3_BOVIN 293 | KR10C_HUMAN 294 | CYC_BOVIN 295 | MT1A_BOVIN 296 | KRT83_HUMAN 297 | KRT85_HUMAN 298 | APOA2_BOVIN 299 | K22O_HUMAN 300 | APOE_BOVIN 301 | K1C17_HUMAN 302 | LUM_BOVIN 303 | K2C80_MOUSE 304 | ITIH1_BOVIN 305 | KT33B_HUMAN 306 | K1H2_HUMAN 307 | KRT81_HUMAN 308 | K1C28_BOVIN 309 | K2C72_HUMAN 310 | K1H1_HUMAN 311 | C4BPA_BOVIN 312 | CFAH_BOVIN 313 | FA5_BOVIN 314 | TRFE_BOVIN 
315 | CO7_BOVIN 316 | CBPB2_BOVIN 317 | TETN_BOVIN 318 | HP20_BOVIN 319 | HP252_BOVIN 320 | CBPN_BOVIN 321 | A1BG_BOVIN 322 | K1C24_HUMAN 323 | CO3_BOVIN 324 | ORC4_BOVIN 325 | APOA4_BOVIN 326 | KR195_HUMAN 327 | CO9_BOVIN 328 | VTDB_BOVIN 329 | GELS_BOVIN 330 | K2C71_HUMAN 331 | FETA_BOVIN 332 | A1AG_BOVIN 333 | HEMO_BOVIN 334 | ITIH4_BOVIN 335 | K22E_MOUSE 336 | ADIPO_BOVIN 337 | G6PI_BOVIN 338 | KRT35_MOUSE 339 | FETUB_BOVIN 340 | FILA2_HUMAN 341 | TPM2_BOVIN 342 | TPM2_BOVIN 343 | K2C79_HUMAN 344 | K2C5_BOVIN 345 | K1H1_MOUSE 346 | K1C40_HUMAN 347 | K1C39_HUMAN 348 | DMKN_HUMAN 349 | DMKN_HUMAN 350 | DMKN_HUMAN 351 | DMKN_HUMAN 352 | DMKN_HUMAN 353 | DMKN_HUMAN 354 | DMKN_HUMAN 355 | DMKN_HUMAN 356 | DMKN_HUMAN 357 | DMKN_HUMAN 358 | DMKN_HUMAN 359 | DMKN_HUMAN 360 | DMKN_HUMAN 361 | DMKN_HUMAN 362 | DMKN_HUMAN 363 | DMKN_HUMAN 364 | K1C42_MOUSE 365 | K1C39_MOUSE 366 | K2C1B_MOUSE 367 | K2C72_MOUSE 368 | K2C80_HUMAN 369 | K2C73_MOUSE 370 | K2C74_HUMAN 371 | K1C28_HUMAN 372 | K1C27_HUMAN 373 | K1C26_HUMAN 374 | K1C25_HUMAN 375 | K2C1B_HUMAN 376 | K2C73_HUMAN 377 | K2C73_HUMAN 378 | HORN_HUMAN 379 | K2C75_MOUSE 380 | KT222_HUMAN 381 | KT222_HUMAN 382 | K2C78_HUMAN 383 | K2C78_HUMAN 384 | K2C79_MOUSE 385 | K2C5_MOUSE 386 | KRT35_HUMAN 387 | PEDF_BOVIN 388 | CHIA_BOVIN 389 | K1C12_HUMAN 390 | K1C23_MOUSE 391 | KRA46_HUMAN 392 | KRA41_HUMAN 393 | KRA49_HUMAN 394 | KRA43_HUMAN 395 | KRA31_HUMAN 396 | KRA24_HUMAN 397 | K1C23_HUMAN 398 | K1C23_HUMAN 399 | K1C20_MOUSE 400 | KRT34_MOUSE 401 | K2C7_MOUSE 402 | IPSP_BOVIN 403 | KRT84_HUMAN 404 | KRT82_HUMAN 405 | K1C17_MOUSE 406 | K2C71_MOUSE 407 | ASPN_PSEFR 408 | THBG_BOVIN 409 | SPA31_BOVIN 410 | GFPL1_ZOASP 411 | K1C16_MOUSE 412 | trY2_BOVIN 413 | trY1_BOVIN 414 | Streptavidin 415 | REFSEQ:XP_986630 416 | REFSEQ:XP_001474382 417 | REFSEQ:XP_092267 418 | REFSEQ:XP_932229 419 | H-INV:HIT000016045 420 | H-INV:HIT000292931 421 | H-INV:HIT000015463 422 | ENSEMBL:ENSP00000377550 423 | ENSEMBL:ENSBTAP00000006074 424 | ENSEMBL:ENSBTAP00000038329 425 | REFSEQ:XP_001252647 426 | ENSEMBL:ENSBTAP00000007350 427 | ENSEMBL:ENSBTAP00000038253 428 | ENSEMBL:ENSBTAP00000023402 429 | ENSEMBL:ENSBTAP00000024466 430 | ENSEMBL:ENSBTAP00000023055 431 | ENSEMBL:ENSBTAP00000018229 432 | ENSEMBL:ENSBTAP00000016046 433 | ENSEMBL:ENSBTAP00000024462 434 | ENSEMBL:ENSBTAP00000014147 435 | ENSEMBL:ENSBTAP00000033053 436 | ENSEMBL:ENSBTAP00000001528 437 | ENSEMBL:ENSBTAP00000037665 438 | ENSEMBL:ENSBTAP00000031900 439 | ENSEMBL:ENSBTAP00000031360 440 | ENSEMBL:ENSBTAP00000018574 441 | ENSEMBL:ENSBTAP00000032840 442 | ENSEMBL:ENSBTAP00000011227 443 | ENSEMBL:ENSBTAP00000025008 444 | ENSEMBL:ENSBTAP00000034412 445 | ENSEMBL:ENSBTAP00000013050 446 | ENSEMBL:ENSBTAP00000016285 447 | ENSEMBL:ENSBTAP00000024146 448 | REFSEQ:XP_58501 449 | -------------------------------------------------------------------------------- /data/high_abundant_proteins.tsv: -------------------------------------------------------------------------------- 1 | P68871 2 | HBB_HUMAN 3 | 4 | -------------------------------------------------------------------------------- /data/histones.json: -------------------------------------------------------------------------------- 1 | { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | 
"Q5SQT3", 23 | "Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | "Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7" 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN" 196 | ] 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | 
"Q9CQ70", 219 | "Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | "A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7" 296 | ] 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040" 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | 
"B9EI85_MOUSE", 410 | "Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE" 418 | ] 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489" 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 | "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL" 478 | ] 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309" 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST" 502 | ] 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": [ 508 | "P48003", 509 | "P04909", 510 | "P04910", 511 | "P04913", 512 | "P09988", 513 | "P10651", 514 | "P09322" 515 | ], 516 | "histone_entries": [ 517 | "H2AZ_SCHPO", 518 | "H2A1_SCHPO", 519 | "H2A2_SCHPO", 520 | "H2B1_SCHPO", 521 | "H31_SCHPO", 522 | "H33_SCHPO", 523 | "H4_SCHPO" 524 | ] 525 | } 526 | } 527 | -------------------------------------------------------------------------------- /data/ibaqpy.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/data/ibaqpy.drawio.png -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | # You can use this file to create a conda environment for this pipeline: 2 | # conda env create -f environment.yml 3 | name: ibaqpy 4 | channels: 5 | - openms 6 | - conda-forge 7 | - bioconda 8 | dependencies: 9 | - python>=3.9 10 | - scikit-learn 11 | - pyopenms 12 | - numpy<2.1.0 13 | - click 14 | - pandas 15 | - matplotlib 16 | - pyarrow>=16.1.0 17 | - duckdb>=0.10.1 18 | - qnorm 19 | - scipy>=1.10 20 | - seaborn>=0.13.2 21 | - typing_extensions>=4.6.3 22 | - inmoose 23 | -------------------------------------------------------------------------------- /ibaqpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ibaqpy - A Python package for iBAQ (intensity-based absolute quantification) analysis. 3 | 4 | This package provides tools for processing and analyzing proteomics data using 5 | the iBAQ method, which allows for absolute quantification of proteins. 
6 | """ 7 | 8 | import warnings 9 | 10 | # Suppress numpy matrix deprecation warning 11 | warnings.filterwarnings( 12 | "ignore", category=PendingDeprecationWarning, module="numpy.matrixlib.defmatrix" 13 | ) 14 | 15 | __version__ = "0.0.5" 16 | 17 | # Import logging configuration 18 | from ibaqpy.ibaq.logging_config import initialize_logging 19 | 20 | # Initialize logging with default settings 21 | # Users can override these settings by calling initialize_logging with their own settings 22 | initialize_logging() 23 | -------------------------------------------------------------------------------- /ibaqpy/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/commands/__init__.py -------------------------------------------------------------------------------- /ibaqpy/commands/correct_batches.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import click 7 | import pandas as pd 8 | 9 | from ibaqpy.ibaq.file_utils import create_anndata, combine_ibaq_tsv_files 10 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID_REGEX, SAMPLE_ID, PROTEIN_NAME, IBAQ, IBAQ_BEC 11 | from ibaqpy.ibaq.ibaqpy_postprocessing import ( 12 | pivot_wider, 13 | pivot_longer, 14 | ) 15 | from ibaqpy.ibaq.utils import apply_batch_correction 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | 21 | 22 | def is_valid_sample_id( 23 | samples: Union[str, list, pd.Series], sample_id_pattern: str = SAMPLE_ID_REGEX 24 | ) -> bool: 25 | """ 26 | Validate sample IDs against a specified pattern. 27 | 28 | This function checks whether the provided sample IDs match a given regex pattern. 29 | It accepts a single sample ID, a list of sample IDs, or a pandas Series of sample IDs. 30 | If any sample ID does not match the pattern, it prints the invalid IDs and returns False. 31 | Otherwise, it returns True. 32 | 33 | Parameters 34 | ---------- 35 | samples : Union[str, list, pd.Series] 36 | The sample ID(s) to validate. 37 | sample_id_pattern : str, optional 38 | The regex pattern to validate the sample IDs against. Defaults to 'SAMPLE_ID_REGEX'. 39 | 40 | Returns 41 | ------- 42 | bool 43 | True if all sample IDs are valid, False otherwise. 44 | """ 45 | sample_pattern = re.compile(sample_id_pattern) 46 | 47 | # Ensure samples is a list for uniform processing 48 | if isinstance(samples, str): 49 | samples = [samples] 50 | elif isinstance(samples, pd.Series): 51 | samples = samples.tolist() 52 | 53 | # Identify invalid sample names. 54 | invalid_samples = [sample for sample in samples if not sample_pattern.fullmatch(sample)] 55 | 56 | if invalid_samples: 57 | logger.error("The following sample IDs are invalid:") 58 | for invalid_sample in invalid_samples: 59 | logger.error(f" - {invalid_sample}") 60 | return False 61 | return True 62 | 63 | 64 | def get_batch_id_from_sample_names(samples: list) -> list: 65 | """ 66 | Extract batch IDs from a list of sample names. 67 | 68 | Each sample name is expected to have a batch ID as a prefix, separated by a hyphen. 69 | The function validates that the batch ID consists of alphanumeric characters only. 70 | Returns a list of unique batch IDs as integer factors. 
71 | 72 | Parameters 73 | ---------- 74 | samples : list 75 | A list of sample names, each containing a batch ID prefix. 76 | 77 | Returns 78 | ------- 79 | list 80 | A list of integer factors representing unique batch IDs. 81 | 82 | Raises 83 | ------ 84 | ValueError 85 | If a sample name does not contain a valid batch ID prefix or if the 86 | batch ID contains non-alphanumeric characters. 87 | """ 88 | batch_ids = [] 89 | for sample in samples: 90 | parts = sample.split("-") 91 | if not parts or not parts[0]: 92 | raise ValueError(f"Invalid sample name format: {sample}. Expected batch-id prefix.") 93 | batch_id = parts[0] 94 | if not re.match(r"^[A-Za-z0-9]+$", batch_id): 95 | raise ValueError( 96 | f"Invalid batch ID format: {batch_id}. Expected alphanumeric characters only." 97 | ) 98 | batch_ids.append(batch_id) 99 | return pd.factorize(batch_ids)[0].tolist()  # list of integer codes, as documented 100 | 101 | 102 | def run_batch_correction( 103 | folder: str, 104 | pattern: str, 105 | comment: str, 106 | sep: str, 107 | output: str, 108 | sample_id_column: str = SAMPLE_ID, 109 | protein_id_column: str = PROTEIN_NAME, 110 | ibaq_raw_column: str = IBAQ, 111 | ibaq_corrected_column: str = IBAQ_BEC, 112 | export_anndata: bool = False, 113 | ) -> pd.DataFrame: 114 | """ 115 | Run batch correction on iBAQ data from TSV files in a specified directory. 116 | 117 | This function combines multiple TSV files, reshapes the data, validates sample IDs, 118 | applies batch correction, and optionally exports the results to an AnnData object. 119 | 120 | Parameters 121 | ---------- 122 | folder : str 123 | Directory containing the TSV files. 124 | pattern : str 125 | Pattern to match files in the directory. 126 | comment : str 127 | Character indicating the start of a comment line in the TSV files. 128 | sep : str 129 | Delimiter for reading the TSV files. 130 | output : str 131 | File path to save the corrected iBAQ values. 132 | sample_id_column : str, optional 133 | Column name for sample IDs. Defaults to 'SAMPLE_ID'. 134 | protein_id_column : str, optional 135 | Column name for protein IDs. Defaults to 'PROTEIN_NAME'. 136 | ibaq_raw_column : str, optional 137 | Column name for raw iBAQ values. Defaults to 'IBAQ'. 138 | ibaq_corrected_column : str, optional 139 | Column name for corrected iBAQ values. Defaults to 'IBAQ_BEC'. 140 | export_anndata : bool, optional 141 | Whether to export the data to an AnnData object. Defaults to False. 142 | 143 | Returns 144 | ------- 145 | pd.DataFrame 146 | DataFrame containing the original and corrected iBAQ values. 147 | 148 | Raises 149 | ------ 150 | ValueError 151 | If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved. 152 | FileNotFoundError 153 | If the output file does not exist when exporting to AnnData. 
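Examples
--------
A minimal sketch (folder and file names are illustrative; sample IDs in the
input files must match SAMPLE_ID_REGEX so their batch prefix can be parsed):

    df = run_batch_correction(
        folder="./ibaq-tsvs",
        pattern="*ibaq.tsv",
        comment="#",
        sep="\t",
        output="ibaq-corrected.tsv",
    )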
154 | """ 155 | 156 | # Load the data 157 | logger.info(f"Loading iBAQ data from TSV files in folder '{folder}'") 158 | 159 | try: 160 | df_ibaq = combine_ibaq_tsv_files(folder, pattern=pattern, comment=comment, sep=sep) 161 | except Exception as e: 162 | raise ValueError(f"Failed to load input files: {str(e)}") 163 | 164 | # Reshape the data to wide format 165 | df_wide = pivot_wider( 166 | df_ibaq, 167 | row_name=protein_id_column, 168 | col_name=sample_id_column, 169 | values=ibaq_raw_column, 170 | fillna=True, 171 | ) 172 | 173 | # Validate the sample IDs 174 | if not is_valid_sample_id(df_wide.columns, SAMPLE_ID_REGEX): 175 | raise ValueError("Invalid sample IDs found in the data.") 176 | 177 | # Get the batch IDs 178 | batch_ids = get_batch_id_from_sample_names(df_wide.columns) 179 | 180 | # Run batch correction 181 | logger.info("Applying batch correction to iBAQ values") 182 | df_corrected = apply_batch_correction(df_wide, list(batch_ids), kwargs={}) 183 | 184 | # Convert the data back to long format 185 | df_corrected = df_corrected.reset_index() 186 | df_corrected_long = pivot_longer( 187 | df_corrected, 188 | row_name=protein_id_column, 189 | col_name=sample_id_column, 190 | values=ibaq_corrected_column, 191 | ) 192 | 193 | # Add the corrected ibaq values to the original dataframe. 194 | # Use sample/protein ID keys to merge the dataframes. 195 | df_ibaq = df_ibaq.merge( 196 | df_corrected_long, how="left", on=[sample_id_column, protein_id_column] 197 | ) 198 | 199 | # Save the corrected iBAQ values to a file 200 | if output: 201 | try: 202 | df_ibaq.to_csv(output, sep=sep, index=False) 203 | except Exception as e: 204 | raise ValueError(f"Failed to save output file: {str(e)}") 205 | 206 | # Export the raw and corrected iBAQ values to an AnnData object 207 | if export_anndata: 208 | logger.info("Exporting raw and corrected iBAQ values to an AnnData object") 209 | output_path = Path(output) 210 | if not output_path.exists(): 211 | raise FileNotFoundError(f"Output file {output} does not exist!") 212 | adata = create_anndata( 213 | df_ibaq, 214 | obs_col=sample_id_column, 215 | var_col=protein_id_column, 216 | value_col=ibaq_raw_column, 217 | layer_cols=[ibaq_corrected_column], 218 | ) 219 | adata_filename = output_path.with_suffix(".h5ad") 220 | try: 221 | adata.write(adata_filename) 222 | except Exception as e: 223 | raise ValueError(f"Failed to write AnnData object: {e}") 224 | 225 | logger.info("Batch correction completed...") 226 | 227 | return df_ibaq 228 | 229 | 230 | @click.command("correct-batches", short_help="Batch effect correction for iBAQ values.") 231 | @click.option( 232 | "-f", 233 | "--folder", 234 | help="Folder that contains all TSV files with raw iBAQ values", 235 | required=True, 236 | default=None, 237 | ) 238 | @click.option( 239 | "-p", 240 | "--pattern", 241 | help="Pattern for the TSV files with raw iBAQ values", 242 | required=True, 243 | default="*ibaq.tsv", 244 | ) 245 | @click.option( 246 | "--comment", 247 | help="Comment character for the TSV files. 
Lines starting with this character will be ignored.", 248 | required=False, 249 | default="#", 250 | ) 251 | @click.option("--sep", help="Separator for the TSV files", required=False, default="\t") 252 | @click.option( 253 | "-o", 254 | "--output", 255 | help="Output file name for the combined iBAQ corrected values", 256 | required=True, 257 | ) 258 | @click.option( 259 | "-sid", 260 | "--sample_id_column", 261 | help="Sample ID column name", 262 | required=False, 263 | default=SAMPLE_ID, 264 | ) 265 | @click.option( 266 | "-pid", 267 | "--protein_id_column", 268 | help="Protein ID column name", 269 | required=False, 270 | default=PROTEIN_NAME, 271 | ) 272 | @click.option( 273 | "-ibaq", "--ibaq_raw_column", help="Name of the raw iBAQ column", required=False, default=IBAQ 274 | ) 275 | @click.option( 276 | "--ibaq_corrected_column", 277 | help="Name for the corrected iBAQ column", 278 | required=False, 279 | default=IBAQ_BEC, 280 | ) 281 | @click.option( 282 | "--export_anndata", 283 | help="Export the raw and corrected iBAQ values to an AnnData object", 284 | is_flag=True, 285 | ) 286 | @click.pass_context 287 | def correct_batches( 288 | ctx, 289 | folder: str, 290 | pattern: str, 291 | comment: str, 292 | sep: str, 293 | output: str, 294 | sample_id_column: str, 295 | protein_id_column: str, 296 | ibaq_raw_column: str, 297 | ibaq_corrected_column: str, 298 | export_anndata: bool, 299 | ): 300 | """ 301 | Correct batch effects in iBAQ data. 302 | 303 | This command processes TSV files containing raw iBAQ values, applies batch correction, 304 | and outputs the corrected values. It supports various options for specifying file patterns, 305 | column names, and output formats, including exporting to an AnnData file. 306 | """ 307 | run_batch_correction( 308 | folder=folder, 309 | pattern=pattern, 310 | comment=comment, 311 | sep=sep, 312 | output=output, 313 | sample_id_column=sample_id_column, 314 | protein_id_column=protein_id_column, 315 | ibaq_raw_column=ibaq_raw_column, 316 | ibaq_corrected_column=ibaq_corrected_column, 317 | export_anndata=export_anndata, 318 | ) 319 | -------------------------------------------------------------------------------- /ibaqpy/commands/features2peptides.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ibaqpy.ibaq.peptide_normalization import peptide_normalization 4 | from ibaqpy.model.normalization import FeatureNormalizationMethod, PeptideNormalizationMethod 5 | 6 | 7 | @click.command("features2peptides", short_help="Convert features to peptide intensities.") 8 | @click.option( 9 | "-p", 10 | "--parquet", 11 | help="Feature parquet file generated by quantms.io", 12 | required=True, 13 | type=click.Path(exists=True), 14 | ) 15 | @click.option( 16 | "-s", "--sdrf", help="SDRF file generated by quantms", default=None, type=click.Path() 17 | ) 18 | @click.option( 19 | "--min_aa", help="Minimum number of amino acids to filter peptides", type=int, default=7 20 | ) 21 | @click.option( 22 | "--min_unique", 23 | help="Minimum number of unique peptides to filter proteins", 24 | default=2, 25 | type=int, 26 | ) 27 | @click.option( 28 | "--remove_ids", 29 | help="Remove specific protein ids from the analysis using a file with one id per line", 30 | type=click.Path(exists=True), 31 | ) 32 | @click.option( 33 | "--remove_decoy_contaminants", 34 | help="Remove decoy and contaminant proteins from the analysis", 35 | is_flag=True, 36 | default=False, 37 | ) 38 | @click.option( 39 | 
"--remove_low_frequency_peptides", 40 | help="Remove peptides that are present in less than 20% of the samples", 41 | is_flag=True, 42 | default=False, 43 | ) 44 | @click.option( 45 | "-o", 46 | "--output", 47 | help="Peptide intensity file including other all properties for normalization", 48 | type=click.Path(), 49 | ) 50 | @click.option("--skip_normalization", help="Skip normalization step", is_flag=True, default=False) 51 | @click.option( 52 | "--nmethod", 53 | help="Normalization method used to normalize feature intensities for tec (options: mean, median, iqr, none)", 54 | default="median", 55 | type=click.Choice([f.name.lower() for f in FeatureNormalizationMethod], case_sensitive=False), 56 | ) 57 | @click.option( 58 | "--pnmethod", 59 | help="Normalization method used to normalize peptides intensities for all samples (options:globalMedian, conditionMedian)", 60 | default="globalMedian", 61 | type=click.Choice([p.name.lower() for p in PeptideNormalizationMethod], case_sensitive=False), 62 | ) 63 | @click.option( 64 | "--log2", 65 | help="Transform to log2 the peptide intensity values before normalization", 66 | is_flag=True, 67 | ) 68 | @click.option( 69 | "--save_parquet", 70 | help="Save normalized peptides to parquet", 71 | is_flag=True, 72 | ) 73 | @click.pass_context 74 | def features2parquet( 75 | ctx, 76 | parquet: str, 77 | sdrf: str, 78 | min_aa: int, 79 | min_unique: int, 80 | remove_ids: str, 81 | remove_decoy_contaminants: bool, 82 | remove_low_frequency_peptides: bool, 83 | output: str, 84 | skip_normalization: bool, 85 | nmethod: str, 86 | pnmethod: str, 87 | log2: bool, 88 | save_parquet: bool, 89 | ) -> None: 90 | """ 91 | Convert feature data to a parquet file with optional normalization and filtering steps. 92 | """ 93 | 94 | peptide_normalization( 95 | parquet=parquet, 96 | sdrf=sdrf, 97 | min_aa=min_aa, 98 | min_unique=min_unique, 99 | remove_ids=remove_ids, 100 | remove_decoy_contaminants=remove_decoy_contaminants, 101 | remove_low_frequency_peptides=remove_low_frequency_peptides, 102 | output=output, 103 | skip_normalization=skip_normalization, 104 | nmethod=nmethod, 105 | pnmethod=pnmethod, 106 | log2=log2, 107 | save_parquet=save_parquet, 108 | ) 109 | -------------------------------------------------------------------------------- /ibaqpy/commands/peptides2protein.py: -------------------------------------------------------------------------------- 1 | import click 2 | from ibaqpy.ibaq.peptides2protein import peptides_to_protein 3 | from ibaqpy.model.organism_metadata import OrganismDescription 4 | 5 | 6 | @click.command("peptides2protein", short_help="Compute IBAQ values for proteins") 7 | @click.option( 8 | "-f", 9 | "--fasta", 10 | help="Protein database to compute IBAQ values", 11 | required=True, 12 | type=click.Path(exists=True), 13 | ) 14 | @click.option( 15 | "-p", 16 | "--peptides", 17 | help="Peptide identifications with intensities following the peptide intensity output", 18 | required=True, 19 | type=click.Path(exists=True), 20 | ) 21 | @click.option( 22 | "-e", 23 | "--enzyme", 24 | help="Enzyme used during the analysis of the dataset (default: Trypsin)", 25 | default="Trypsin", 26 | ) 27 | @click.option( 28 | "-n", 29 | "--normalize", 30 | help="Normalize IBAQ values using by using the total IBAQ of the experiment", 31 | is_flag=True, 32 | ) 33 | @click.option("--min_aa", help="Minimum number of amino acids to consider a peptide", default=7) 34 | @click.option("--max_aa", help="Maximum number of amino acids to consider a peptide", default=30) 35 
| @click.option("-t", "--tpa", help="Whether calculate TPA", is_flag=True) 36 | @click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True) 37 | @click.option("-i", "--ploidy", help="Ploidy number (default: 2)", default=2) 38 | @click.option( 39 | "-m", 40 | "--organism", 41 | help="Organism source of the data (default: human)", 42 | type=click.Choice( 43 | sorted(map(str.lower, OrganismDescription.registered_organisms())), case_sensitive=False 44 | ), 45 | default="human", 46 | ) 47 | @click.option( 48 | "-c", "--cpc", help="Cellular protein concentration(g/L) (default: 200)", default=200 49 | ) 50 | @click.option("-o", "--output", help="Output file with the proteins and ibaq values") 51 | @click.option( 52 | "--verbose", 53 | help="Print addition information about the distributions of the intensities, number of peptides remove " 54 | "after normalization, etc.", 55 | is_flag=True, 56 | ) 57 | @click.option( 58 | "--qc_report", 59 | help="PDF file to store multiple QC images", 60 | default="QCprofile.pdf", 61 | ) 62 | @click.pass_context 63 | def peptides2protein( 64 | click_context, 65 | fasta: str, 66 | peptides: str, 67 | enzyme: str, 68 | normalize: bool, 69 | min_aa: int, 70 | max_aa: int, 71 | tpa: bool, 72 | ruler: bool, 73 | organism: str, 74 | ploidy: int, 75 | cpc: float, 76 | output: str, 77 | verbose: bool, 78 | qc_report: str, 79 | ) -> None: 80 | """ 81 | Compute IBAQ values for proteins from peptide intensity data. 82 | 83 | This command processes peptide identifications and computes IBAQ values, 84 | optionally normalizing the data and calculating protein metrics using a 85 | proteomic ruler approach. It supports generating a QC report with distribution 86 | plots if verbose mode is enabled. 87 | """ 88 | peptides_to_protein( 89 | fasta=fasta, 90 | peptides=peptides, 91 | enzyme=enzyme, 92 | normalize=normalize, 93 | min_aa=min_aa, 94 | max_aa=max_aa, 95 | tpa=tpa, 96 | ruler=ruler, 97 | ploidy=ploidy, 98 | cpc=cpc, 99 | organism=organism, 100 | output=output, 101 | verbose=verbose, 102 | qc_report=qc_report, 103 | ) 104 | -------------------------------------------------------------------------------- /ibaqpy/commands/tsne_visualization.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | 4 | import click 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | from sklearn.decomposition import PCA 10 | from sklearn.manifold import TSNE 11 | 12 | from ibaqpy.ibaq.ibaqpy_commons import PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | logger.addHandler(logging.NullHandler()) 17 | 18 | 19 | # function to compute principal components 20 | def compute_pca(df, n_components=5) -> pd.DataFrame: 21 | """ 22 | Compute principal components for a given dataframe. 23 | 24 | Parameters: 25 | df : pd.DataFrame 26 | Input dataframe with samples as rows and features as columns. 27 | n_components : int, optional 28 | The number of principal components to compute (default is 5). 29 | Returns: 30 | df_pca : pd.DataFrame 31 | A dataframe with the principal components. 
32 | """ 33 | 34 | pca = PCA(n_components=n_components) 35 | pca.fit(df) 36 | df_pca = pca.transform(df) 37 | 38 | df_pca = pd.DataFrame( 39 | df_pca, index=df.index, columns=[f"PC{i}" for i in range(1, n_components + 1)] 40 | ) 41 | 42 | plt.rcParams["figure.figsize"] = (12, 6) 43 | 44 | fig, ax = plt.subplots() 45 | xi = np.arange(1, n_components + 1, step=1) 46 | y = np.cumsum(pca.explained_variance_ratio_) 47 | 48 | plt.ylim(0.0, 1.1) 49 | plt.plot(xi, y, marker="o", linestyle="--", color="b") 50 | 51 | plt.xlabel("Number of Components") 52 | plt.xticks( 53 | np.arange(0, n_components, step=1) 54 | ) # change from 0-based array index to 1-based human-readable label 55 | plt.ylabel("Cumulative variance (%)") 56 | plt.title("The number of components needed to explain variance") 57 | 58 | plt.axhline(y=0.95, color="r", linestyle="-") 59 | plt.text(0.5, 0.85, "95% cut-off threshold", color="red", fontsize=16) 60 | 61 | ax.grid(axis="x") 62 | plt.show() 63 | 64 | return df_pca 65 | 66 | 67 | def compute_tsne(df_pca, n_components=2, perplexity=30, learning_rate=200, n_iter=2000): 68 | """ 69 | Compute t-SNE components from PCA components. 70 | 71 | This function applies t-SNE (t-Distributed Stochastic Neighbor Embedding) to the input DataFrame, 72 | which is expected to contain PCA components with samples as rows. The output is another DataFrame 73 | that contains t-SNE components, also with samples as rows. 74 | 75 | Parameters: 76 | df_pca : pandas DataFrame 77 | Input DataFrame containing PCA components. Rows are samples and columns are PCA components. 78 | n_components : int, optional 79 | The number of dimensions for the t-SNE components (default is 2). 80 | perplexity : float, optional 81 | The perplexity parameter for t-SNE, which can influence the balance between maintaining 82 | the local and global structure of the data (default is 30). 83 | learning_rate : float, optional 84 | The learning rate for t-SNE (default is 200). 85 | n_iter : int, optional 86 | The number of iterations for t-SNE optimization (default is 2000). 87 | 88 | Returns: 89 | df_tsne : pandas DataFrame 90 | Output DataFrame containing t-SNE components. Rows are samples and columns are t-SNE components. 91 | 92 | Example 93 | ------- 94 | df_pca = pd.DataFrame(data, columns=['PC1', 'PC2', 'PC3']) 95 | df_tsne = compute_tsne(df_pca) 96 | """ 97 | 98 | tsne = TSNE( 99 | n_components=n_components, 100 | perplexity=perplexity, 101 | learning_rate=learning_rate, 102 | n_iter=n_iter, 103 | ) 104 | tsne_results = tsne.fit_transform(np.asarray(df_pca)) 105 | 106 | tsne_cols = [f"tSNE{i + 1}" for i in range(n_components)] 107 | 108 | df_tsne = pd.DataFrame(data=tsne_results, columns=tsne_cols) 109 | df_tsne.index = df_pca.index 110 | return df_tsne 111 | 112 | 113 | def plot_tsne(df, x_col, y_col, hue_col, file_name): 114 | """ 115 | Generate and save a t-SNE scatter plot from a DataFrame. 116 | 117 | This function creates a scatter plot using seaborn's scatterplot function, 118 | with the specified columns for the x-axis, y-axis, and hue. The plot is 119 | customized with labels, a title, and a legend positioned inside the plot. 120 | The resulting plot is saved to the specified file. 121 | 122 | Parameters: 123 | df (pd.DataFrame): The DataFrame containing the data to plot. 124 | x_col (str): The column name for the x-axis values. 125 | y_col (str): The column name for the y-axis values. 126 | hue_col (str): The column name for the hue (color) values. 
127 | file_name (str): The file path where the plot image will be saved. 128 | """ 129 | fig, ax = plt.subplots(1, 1, figsize=(20, 10)) 130 | sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax, markers=["o", "+", "x"]) 131 | ax.set_xlabel(x_col) 132 | ax.set_ylabel(y_col) 133 | ax.set_title(f"{x_col} vs {y_col} with {hue_col} information") 134 | # set legend inside the plot left an upper corner 135 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=8) 136 | plt.subplots_adjust(right=0.8) 137 | plt.savefig(file_name) 138 | 139 | 140 | @click.command() 141 | @click.option("-f", "--folder", help="Folder that contains all the protein files", required=True) 142 | @click.option( 143 | "-o", 144 | "--pattern", 145 | help="Protein file pattern", 146 | # TODO: I think we should use instead of pattern the structure of quantms.io for absolute quantification 147 | required=False, 148 | default="proteins.tsv", 149 | ) 150 | def tsne_visualization(folder: str, pattern: str): 151 | """ 152 | Generate a t-SNE visualization for protein data from specified files. 153 | 154 | This command-line tool reads protein data files from a specified folder, 155 | applies PCA and t-SNE for dimensionality reduction, and generates a scatter 156 | plot of the t-SNE components. The plot is saved as a PDF file. 157 | 158 | Parameters: 159 | folder (str): The folder containing protein data files. 160 | pattern (str): The file pattern to match protein files. Defaults to 'proteins.tsv'. 161 | """ 162 | # get all the files in the folder 163 | files = glob.glob(f"{folder}/*{pattern}") 164 | 165 | # get the files into pandas selected columns 166 | # (Proteins accession, Sample ID, Reanalysis accession, Intensity) 167 | 168 | dfs = [] # list of dataframes 169 | 170 | for f in files: 171 | reanalysis = (f.split("/")[-1].split("_")[0]).replace("-proteins.tsv", "") 172 | dfs += [ 173 | pd.read_csv(f, usecols=[PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG], sep=",").assign( 174 | reanalysis=reanalysis 175 | ) 176 | ] 177 | 178 | total_proteins = pd.concat(dfs, ignore_index=True) 179 | 180 | normalize_df = pd.pivot_table( 181 | total_proteins, 182 | index=[SAMPLE_ID, "reanalysis"], 183 | columns=PROTEIN_NAME, 184 | values=IBAQ_LOG, 185 | ) 186 | normalize_df = normalize_df.fillna(0) 187 | df_pca = compute_pca(normalize_df, n_components=30) 188 | df_tsne = compute_tsne(df_pca) 189 | 190 | batch = df_tsne.index.get_level_values("reanalysis").tolist() 191 | df_tsne["batch"] = batch 192 | 193 | # plot the t-SNE components tSNE1 vs tSNE2 with batch information using seaborn 194 | plot_tsne(df_tsne, "tSNE1", "tSNE2", "batch", "5.tsne_plot_with_batch_information.pdf") 195 | 196 | logger.info(total_proteins.shape) 197 | 198 | 199 | if __name__ == "__main__": 200 | tsne_visualization() 201 | -------------------------------------------------------------------------------- /ibaqpy/data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ibaqpy/data/data.py: -------------------------------------------------------------------------------- 1 | histones = { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | "Q5SQT3", 23 | 
"Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | "Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7", 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN", 196 | ], 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | "Q9CQ70", 219 | 
"Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | "A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7", 296 | ], 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040", 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | "B9EI85_MOUSE", 410 | 
"Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE", 418 | ], 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489", 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 | "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL", 478 | ], 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309", 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST", 502 | ], 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": ["P48003", "P04909", "P04910", "P04913", "P09988", "P10651", "P09322"], 508 | "histone_entries": [ 509 | "H2AZ_SCHPO", 510 | "H2A1_SCHPO", 511 | "H2A2_SCHPO", 512 | "H2B1_SCHPO", 513 | "H31_SCHPO", 514 | "H33_SCHPO", 515 | "H4_SCHPO", 516 | ], 517 | }, 518 | } 519 | -------------------------------------------------------------------------------- /ibaqpy/data/organisms.json: -------------------------------------------------------------------------------- 1 | { 2 | "HUMAN": { 3 | "name": "human", 4 | "genome_size": 3220000000, 5 | "histone_proteins": [ 6 | "P07305", 7 | "Q8IZA3", 8 | "Q92522", 9 | "P0C5Y9", 10 | "P0C5Z0", 11 | "H0YFX9", 12 | "Q9BTM1", 13 | "A8MQC5", 14 | "C9J0D1", 15 | "C9J386", 16 | "E5RJU1", 17 | "Q71UI9", 18 | "P16104", 19 | "B4DJC3", 20 | "D6RCF2", 21 | "O75367", 22 | "Q5SQT3", 23 | "Q9P0M6", 24 | "P0C0S5", 25 | "P0C1H6", 26 | "A9UJN3", 27 | "P57053", 28 | "Q7Z2G1", 29 | "B4DEB1", 30 | "P84243", 31 | "B2R4P9", 32 | "K7EMV3", 33 | "K7ES00", 34 | "K7EK07", 35 | "K7EP01", 36 | "Q6NXT2", 37 | "Q02539", 38 | "P16401", 39 | "P16403", 40 | "P16402", 41 | "Q4VB24", 42 | "P10412", 43 | "A3R0T8", 44 | "A1L407", 45 | "P22492", 46 | "Q96QV6", 47 | "P04908", 48 | "Q08AJ9", 49 | "Q93077", 50 | "P20671", 51 | "P0C0S8", 52 | "A3KPC7", 53 | "Q96KK5", 54 | "Q99878", 55 | "A4FTV9", 56 | "Q92646", 57 | "Q96A08", 58 | "P33778", 59 | "P62807", 60 | "P58876", 61 | "B2R4S9", 62 | "Q93079", 63 | "P06899", 64 | "O60814", 65 | "Q99880", 66 | "I6L9F7", 67 | "Q99879", 68 | "Q99877", 69 | "P23527", 70 | "P68431", 71 | "P62805", 72 | "Q99525", 73 | "Q0VAS5", 74 | "B2R4R0", 75 | "Q6FI13", 76 | "Q8IUE6", 77 | "Q16777", 78 | "Q16778", 79 | "B4DR52", 80 | "Q5QNW6", 81 | "Q71DI3", 82 | "Q5TEC6", 83 | "Q7L7L0", 84 | "Q8N257", 85 | 
"Q16695", 86 | "Q6TXQ4", 87 | "Q14463", 88 | "B4E0B3", 89 | "B2R5B6", 90 | "A2RUA4", 91 | "B2R5B3", 92 | "Q9HA11", 93 | "A8K9J7", 94 | "B2R6Y1", 95 | "B4E380", 96 | "A8K4Y7", 97 | "Q6B823", 98 | "Q6LBZ2", 99 | "A3R0T7" 100 | ], 101 | "histone_entries": [ 102 | "H2AW_HUMAN", 103 | "Q9HA11_HUMAN", 104 | "H2AJ_HUMAN", 105 | "H2B1L_HUMAN", 106 | "H2B1M_HUMAN", 107 | "H2A1J_HUMAN", 108 | "H2B1N_HUMAN", 109 | "H4G_HUMAN", 110 | "H2A1A_HUMAN", 111 | "H2A1H_HUMAN", 112 | "H2B1A_HUMAN", 113 | "H2B1H_HUMAN", 114 | "H2A1C_HUMAN", 115 | "Q92646_HUMAN", 116 | "H1X_HUMAN", 117 | "H2B3B_HUMAN", 118 | "H18_HUMAN", 119 | "H2A2B_HUMAN", 120 | "H2BWT_HUMAN", 121 | "H2A3_HUMAN", 122 | "H2AV_HUMAN", 123 | "H2AV_HUMAN", 124 | "H32_HUMAN", 125 | "Q6TXQ4_HUMAN", 126 | "H3C_HUMAN", 127 | "Q6LBZ2_HUMAN", 128 | "H2A2A_HUMAN", 129 | "Q6B823_HUMAN", 130 | "H37_HUMAN", 131 | "Q5SQT3_HUMAN", 132 | "H2B2F_HUMAN", 133 | "Q4VB24_HUMAN", 134 | "H2B2E_HUMAN", 135 | "H2A2C_HUMAN", 136 | "H31T_HUMAN", 137 | "Q14463_HUMAN", 138 | "Q0VAS5_HUMAN", 139 | "Q08AJ9_HUMAN", 140 | "H11_HUMAN", 141 | "H33_HUMAN", 142 | "H31_HUMAN", 143 | "H2B1C_HUMAN", 144 | "H4_HUMAN", 145 | "H2B1D_HUMAN", 146 | "H2BFS_HUMAN", 147 | "H2B1B_HUMAN", 148 | "H2B1O_HUMAN", 149 | "H1T_HUMAN", 150 | "H2A1D_HUMAN", 151 | "H12_HUMAN", 152 | "H13_HUMAN", 153 | "H15_HUMAN", 154 | "H2AX_HUMAN", 155 | "H14_HUMAN", 156 | "H2AB2_HUMAN", 157 | "H2AB1_HUMAN", 158 | "H2BFM_HUMAN", 159 | "H2A1_HUMAN", 160 | "H2AZ_HUMAN", 161 | "H10_HUMAN", 162 | "H2B1J_HUMAN", 163 | "H2A1B_HUMAN", 164 | "H2AY_HUMAN", 165 | "H2B1K_HUMAN", 166 | "K7ES00_HUMAN", 167 | "K7EP01_HUMAN", 168 | "K7EMV3_HUMAN", 169 | "K7EK07_HUMAN", 170 | "I6L9F7_HUMAN", 171 | "H0YFX9_HUMAN", 172 | "E5RJU1_HUMAN", 173 | "D6RCF2_HUMAN", 174 | "C9J386_HUMAN", 175 | "C9J0D1_HUMAN", 176 | "B4E380_HUMAN", 177 | "B4E0B3_HUMAN", 178 | "B4DR52_HUMAN", 179 | "B4DJC3_HUMAN", 180 | "B4DEB1_HUMAN", 181 | "B2R6Y1_HUMAN", 182 | "B2R5B6_HUMAN", 183 | "B2R5B3_HUMAN", 184 | "B2R4S9_HUMAN", 185 | "B2R4R0_HUMAN", 186 | "B2R4P9_HUMAN", 187 | "A9UJN3_HUMAN", 188 | "A8K9J7_HUMAN", 189 | "A8K4Y7_HUMAN", 190 | "A4FTV9_HUMAN", 191 | "A3R0T8_HUMAN", 192 | "A3R0T7_HUMAN", 193 | "A3KPC7_HUMAN", 194 | "A2RUA4_HUMAN", 195 | "A1L407_HUMAN" 196 | ] 197 | }, 198 | "MOUSE": { 199 | "name": "mouse", 200 | "genome_size": 2800000000, 201 | "histone_proteins": [ 202 | "Q9DAD9", 203 | "B2RTM0", 204 | "Q8CBB6", 205 | "Q921L4", 206 | "Q5M8Q2", 207 | "Q810S6", 208 | "B1AV31", 209 | "Q497L1", 210 | "A9Z055", 211 | "Q8CGP9", 212 | "P10922", 213 | "Q8CJI4", 214 | "E0CZ52", 215 | "E0CYL2", 216 | "Q8VIK3", 217 | "Q80ZM5", 218 | "Q9CQ70", 219 | "Q8R1M2", 220 | "Q3THW5", 221 | "Q8R029", 222 | "B2RVP5", 223 | "P27661", 224 | "Q9QZQ8", 225 | "Q8CA90", 226 | "Q8BP16", 227 | "Q9CTR1", 228 | "Q8CCK0", 229 | "Q9D3V6", 230 | "Q9D3U7", 231 | "Q3UA95", 232 | "Q3TFU6", 233 | "G3UWL7", 234 | "G3UX40", 235 | "P0C0S6", 236 | "F8WI35", 237 | "E0CZ27", 238 | "E0CYN1", 239 | "E0CYR7", 240 | "P84244", 241 | "P02301", 242 | "Q9QYL0", 243 | "P43275", 244 | "P43276", 245 | "P15864", 246 | "Q5SZA3", 247 | "P43277", 248 | "Q149Z9", 249 | "P43274", 250 | "Q07133", 251 | "I7HFT9", 252 | "Q8CGP4", 253 | "P22752", 254 | "B2RVF0", 255 | "Q61668", 256 | "Q8CGP5", 257 | "A0AUV1", 258 | "Q8CGP6", 259 | "A3KPD0", 260 | "Q8CGP7", 261 | "F8WIX8", 262 | "A0JNS9", 263 | "P70696", 264 | "Q64475", 265 | "Q6ZWY9", 266 | "P10853", 267 | "Q64478", 268 | "A0JLV3", 269 | "Q8CGP1", 270 | "B2RVD5", 271 | "P10854", 272 | "B2RTK3", 273 | "Q8CGP2", 274 | "P68433", 275 | "P84228", 276 | "A1L0U3", 277 | 
"A1L0V4", 278 | "P62806", 279 | "B2RWH3", 280 | "Q6GSS7", 281 | "Q64522", 282 | "Q64523", 283 | "Q149V4", 284 | "Q64525", 285 | "G3X9D5", 286 | "Q64524", 287 | "B9EI85", 288 | "Q61667", 289 | "Q8BFU2", 290 | "A2AB79", 291 | "Q9D2U9", 292 | "Q8CGP0", 293 | "Q6B822", 294 | "P07978", 295 | "Q9D9Z7" 296 | ] 297 | }, 298 | "DROME": { 299 | "name": "drome", 300 | "genome_size": 144000000, 301 | "histone_proteins": [ 302 | "Q6TXQ1", 303 | "P02255", 304 | "Q4AB54", 305 | "Q4ABE3", 306 | "Q4ABD8", 307 | "Q4AB94", 308 | "P84051", 309 | "Q4AB57", 310 | "P08985", 311 | "P02283", 312 | "P02299", 313 | "E2QCP0", 314 | "P84249", 315 | "P84040" 316 | ], 317 | "histone_entries": [ 318 | "Q9DAD9_MOUSE", 319 | "B2RTM0_MOUSE", 320 | "Q8CBB6_MOUSE", 321 | "Q921L4_MOUSE", 322 | "H2AL1_MOUSE", 323 | "Q810S6_MOUSE", 324 | "Q9DAD9_MOUSE", 325 | "Q497L1_MOUSE", 326 | "A9Z055_MOUSE", 327 | "Q8CGP9_MOUSE", 328 | "H10_MOUSE", 329 | "H1FNT_MOUSE", 330 | "E0CZ52_MOUSE", 331 | "E0CYL2_MOUSE", 332 | "H18_MOUSE", 333 | "Q80ZM5_MOUSE", 334 | "H2AB1_MOUSE", 335 | "H2AJ_MOUSE", 336 | "H2AV_MOUSE", 337 | "Q8R029_MOUSE", 338 | "B2RVP5_MOUSE", 339 | "H2AX_MOUSE", 340 | "H2AY_MOUSE", 341 | "Q8CA90_MOUSE", 342 | "Q8BP16_MOUSE", 343 | "Q9CTR1_MOUSE", 344 | "H2AW_MOUSE", 345 | "Q9D3V6_MOUSE", 346 | "Q9D3U7_MOUSE", 347 | "Q3UA95_MOUSE", 348 | "Q3TFU6_MOUSE", 349 | "G3UWL7_MOUSE", 350 | "G3UX40_MOUSE", 351 | "H2AZ_MOUSE", 352 | "F8WI35_MOUSE", 353 | "E0CZ27_MOUSE", 354 | "E0CYN1_MOUSE", 355 | "E0CYR7_MOUSE", 356 | "H33_MOUSE", 357 | "H3C_MOUSE", 358 | "HILS1_MOUSE", 359 | "H11_MOUSE", 360 | "H15_MOUSE", 361 | "H12_MOUSE", 362 | "Q5SZA3_MOUSE", 363 | "H13_MOUSE", 364 | "Q149Z9_MOUSE", 365 | "H14_MOUSE", 366 | "H1T_MOUSE", 367 | "I7HFT9_MOUSE", 368 | "Q8CGP4_MOUSE", 369 | "H2A1P_MOUSE", 370 | "H2A1B_MOUSE", 371 | "H2A1C_MOUSE", 372 | "H2A1D_MOUSE", 373 | "H2A1E_MOUSE", 374 | "H2A1G_MOUSE", 375 | "H2A1I_MOUSE", 376 | "H2A1N_MOUSE", 377 | "H2A1O_MOUSE", 378 | "B2RVF0_MOUSE", 379 | "Q61668_MOUSE", 380 | "H2A1F_MOUSE", 381 | "A0AUV1_MOUSE", 382 | "H2A1H_MOUSE", 383 | "A3KPD0_MOUSE", 384 | "H2A1K_MOUSE", 385 | "A0JNS9_MOUSE", 386 | "H2B1A_MOUSE", 387 | "H2B1B_MOUSE", 388 | "H2B1C_MOUSE", 389 | "H2B1F_MOUSE", 390 | "H2B1H_MOUSE", 391 | "A0JLV3_MOUSE", 392 | "H2B1K_MOUSE", 393 | "B2RVD5_MOUSE", 394 | "H2B1M_MOUSE", 395 | "B2RTK3_MOUSE", 396 | "H2B1P_MOUSE", 397 | "H31_MOUSE", 398 | "H32_MOUSE", 399 | "A1L0U3_MOUSE", 400 | "A1L0V4_MOUSE", 401 | "H4_MOUSE", 402 | "B2RWH3_MOUSE", 403 | "H2A2A_MOUSE", 404 | "H2A2B_MOUSE", 405 | "H2A2C_MOUSE", 406 | "Q149V4_MOUSE", 407 | "H2B2B_MOUSE", 408 | "H2B2E_MOUSE", 409 | "B9EI85_MOUSE", 410 | "Q61667_MOUSE", 411 | "H2A3_MOUSE", 412 | "A2AB79_MOUSE", 413 | "H2B3A_MOUSE", 414 | "H2B3B_MOUSE", 415 | "Q6B822_MOUSE", 416 | "PRM2_MOUSE", 417 | "H2BL1_MOUSE" 418 | ] 419 | }, 420 | "CAEEL": { 421 | "name": "caeel", 422 | "genome_size": 104000000, 423 | "histone_proteins": [ 424 | "P10771", 425 | "P15796", 426 | "Q19743", 427 | "O17536", 428 | "O01833", 429 | "Q9U3W3", 430 | "Q18336", 431 | "P09588", 432 | "J7S164", 433 | "J7SA65", 434 | "Q27485", 435 | "Q23429", 436 | "Q27511", 437 | "P04255", 438 | "Q27894", 439 | "P08898", 440 | "K7ZUH9", 441 | "Q10453", 442 | "Q9U281", 443 | "Q27490", 444 | "Q27532", 445 | "P62784", 446 | "Q27484", 447 | "Q27876", 448 | "O16277", 449 | "Q27489" 450 | ], 451 | "histone_entries": [ 452 | "H24_CAEEL", 453 | "H12_CAEEL", 454 | "H13_CAEEL", 455 | "H14_CAEEL", 456 | "H15_CAEEL", 457 | "Q9U3W3_CAEEL", 458 | "H1X_CAEEL", 459 | "H2A_CAEEL", 460 | "J7S164_CAEEL", 461 | "J7SA65_CAEEL", 462 
| "Q27485_CAEEL", 463 | "Q23429_CAEEL", 464 | "H2AV_CAEEL", 465 | "H2B1_CAEEL", 466 | "H2B2_CAEEL", 467 | "H3_CAEEL", 468 | "K7ZUH9_CAEEL", 469 | "H331_CAEEL", 470 | "H332_CAEEL", 471 | "H33L1_CAEEL", 472 | "H33L2_CAEEL", 473 | "H4_CAEEL", 474 | "H2B3_CAEEL", 475 | "H2B4_CAEEL", 476 | "H16_CAEEL", 477 | "H33L3_CAEEL" 478 | ] 479 | }, 480 | "YEAST": { 481 | "name": "yeast", 482 | "genome_size": 12100000, 483 | "histone_proteins": [ 484 | "P53551", 485 | "P04911", 486 | "P04912", 487 | "Q12692", 488 | "P02293", 489 | "P02294", 490 | "P61830", 491 | "P02309" 492 | ], 493 | "histone_entries": [ 494 | "H1_YEAST", 495 | "H2A1_YEAST", 496 | "H2A2_YEAST", 497 | "H2AZ_YEAST", 498 | "H2B1_YEAST", 499 | "H2B2_YEAST", 500 | "H3_YEAST", 501 | "H4_YEAST" 502 | ] 503 | }, 504 | "SCHPO": { 505 | "name": "schpo", 506 | "genome_size": 14100000, 507 | "histone_proteins": [ 508 | "P48003", 509 | "P04909", 510 | "P04910", 511 | "P04913", 512 | "P09988", 513 | "P10651", 514 | "P09322" 515 | ], 516 | "histone_entries": [ 517 | "H2AZ_SCHPO", 518 | "H2A1_SCHPO", 519 | "H2A2_SCHPO", 520 | "H2B1_SCHPO", 521 | "H31_SCHPO", 522 | "H33_SCHPO", 523 | "H4_SCHPO" 524 | ] 525 | } 526 | } 527 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/ibaq/__init__.py -------------------------------------------------------------------------------- /ibaqpy/ibaq/combiner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from ibaqpy.ibaq.ibaqpy_commons import load_feature, load_sdrf 9 | from ibaqpy.ibaq.imputation_methods import impute_missing_values 10 | from ibaqpy.ibaq.utils import ( 11 | compute_pca, 12 | get_batch_info_from_sample_names, 13 | generate_meta, 14 | folder_retrieval, 15 | filter_missing_value_by_group, 16 | split_df_by_column, 17 | fill_samples, 18 | iterative_outlier_removal, 19 | plot_pca, 20 | remove_single_sample_batches, 21 | apply_batch_correction, 22 | ) 23 | 24 | logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class Combiner: 29 | def __init__(self, data_folder: os.PathLike, covariate: str = None, organism: str = "HUMAN"): 30 | """ 31 | Initialize a Combiner instance to process and combine SDRF and iBAQ data. 32 | 33 | Parameters: 34 | data_folder : os.PathLike 35 | Path to the folder containing SDRF and iBAQ files. 36 | covariate : str, optional 37 | Covariate to be used in data processing, default is None. 38 | organism : str, optional 39 | Organism filter for protein names, default is "HUMAN". 40 | 41 | Raises: 42 | FileNotFoundError 43 | If the specified data folder does not exist or is not a directory. 44 | 45 | Notes 46 | ----- 47 | This method initializes various attributes for data processing, retrieves 48 | SDRF and iBAQ files from the specified folder, and filters protein data 49 | by the specified organism. 
50 | """ 51 | self.df_pca = compute_pca(self.df_corrected.T, n_components=5) 52 | self.df_corrected = None 53 | self.batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) 54 | self.df_pca = None 55 | self.df_filtered_outliers = None 56 | self.batch_index = get_batch_info_from_sample_names(self.df.columns) 57 | self.samples_number = None 58 | self.datasets = None 59 | self.samples = self.df.columns.tolist() 60 | self.proteins = self.df["ProteinName"].unique().tolist() 61 | logger.info("Combining SDRFs and ibaq results ...") 62 | self.data_folder = Path(data_folder) 63 | if not self.data_folder.exists() or not self.data_folder.is_dir(): 64 | raise FileNotFoundError(f"Data folder {self.data_folder} does not exsit!") 65 | self.covariate = covariate 66 | files = folder_retrieval(str(self.data_folder)) 67 | self.metadata, self.df = pd.DataFrame(), pd.DataFrame() 68 | for sdrf in files["sdrf"]: 69 | sdrf_df = load_sdrf(sdrf) 70 | self.metadata = pd.concat([self.metadata, generate_meta(sdrf_df)]) 71 | self.metadata = self.metadata.drop_duplicates() 72 | self.metadata.index = self.metadata["sample_id"] 73 | 74 | for ibaq in files["ibaq"]: 75 | self.df = pd.concat([self.df, load_feature(ibaq)]) 76 | self.df = self.df[self.df["ProteinName"].str.endswith(organism)] 77 | self.df.index = self.df["SampleID"] 78 | self.df = self.df.join(self.metadata, how="left") 79 | logger.info(self.metadata, self.df.head) 80 | 81 | def read_data(self, meta: str, ibaq: str, organism="HUMAN", covariate=None): 82 | """ 83 | Reads and processes iBAQ and metadata files, filtering protein data by organism. 84 | 85 | Parameters: 86 | meta : str 87 | Path to the metadata CSV file. 88 | ibaq : str 89 | Path to the iBAQ CSV file. 90 | organism : str, optional 91 | Organism filter for protein names, default is "HUMAN". 92 | covariate : str, optional 93 | Covariate to be used in data processing, default is None. 94 | 95 | Notes 96 | ----- 97 | The method updates the instance's dataframe and metadata attributes by 98 | reading the specified files, filtering proteins by the given organism, 99 | and joining metadata to the iBAQ data. 100 | """ 101 | 102 | self.covariate = covariate 103 | self.df = pd.read_csv(ibaq, index_col=0) 104 | self.metadata = pd.read_csv(meta) 105 | self.df = self.df[self.df["ProteinName"].str.endswith(organism)] 106 | self.df.index = self.df["SampleID"] 107 | self.metadata = self.metadata.drop_duplicates() 108 | self.df = self.df.join(self.metadata, how="left") 109 | 110 | def imputer(self, covariate_to_keep: list = None): 111 | """ 112 | Impute missing values in the combined iBAQ results DataFrame. 113 | 114 | This method processes the DataFrame by filtering, filling, and imputing 115 | missing values based on specified covariates. It ensures that only columns 116 | with a sufficient percentage of non-missing values are retained and performs 117 | imputation using KNN or other specified methods. 118 | 119 | Parameters: 120 | covariate_to_keep : list, optional 121 | A list of covariate values to retain in the DataFrame. Only rows with 122 | these covariate values will be kept. 123 | 124 | Raises: 125 | SystemExit 126 | If the specified covariate contains fewer than two unique values. 127 | 128 | Notes 129 | ----- 130 | - The method modifies the instance's DataFrame by imputing missing values 131 | and potentially altering its structure. 132 | - The imputation process requires samples as columns and proteins as rows. 
133 | """ 134 | logger.info("Imputing merged ibaq results ...") 135 | # Keep only columns 'sample_id' and covariate from df_metadata 136 | if self.covariate: 137 | if len(self.metadata[self.covariate].unique()) < 2: 138 | raise SystemExit( 139 | f"{self.covariate} should contain at least two different covariates!" 140 | ) 141 | 142 | # Keep only rows within covariate_to_keep, you can keep tissue or tissue part you want. 143 | if covariate_to_keep: 144 | self.df = self.df[self.df[self.covariate].isin(covariate_to_keep)] 145 | 146 | # keep columns with at least 30% of non-missing values in each covariate_index group 147 | self.df = filter_missing_value_by_group( 148 | self.df, col="ProteinName", non_missing_percent_to_keep=0.3 149 | ) 150 | 151 | # TODO: Data for imputation should take samples as columns, proteins as rows. [Expression Matrix] 152 | # Also need to fill the proteins didn't show in original results for each sample. 153 | if self.covariate: 154 | # split df by covariates 155 | df_list = split_df_by_column(self.df, cov_index_col=self.covariate) 156 | df_list = [fill_samples(df, self.proteins) for df in df_list] 157 | 158 | # impute missing values with KNNImputer for every df in df_list 159 | df_list = impute_missing_values(df_list) 160 | 161 | # concatenate all dataframes in df_list into one dataframe 162 | self.df = pd.concat(df_list, axis=1) 163 | else: 164 | self.df = fill_samples(self.df, self.proteins) 165 | self.df = impute_missing_values(self.df) 166 | 167 | self.datasets = list(set([sample.split("-")[0] for sample in self.samples])) 168 | logger.info(f"DataFrame head after imputation:\n{self.df.head()}") 169 | 170 | def outlier_removal( 171 | self, 172 | n_components: int = None, 173 | min_cluster_size: int = None, 174 | min_samples_num: int = None, 175 | n_iter: int = None, 176 | ): 177 | """ 178 | Remove outliers from the imputed data using an iterative approach and plot the PCA results. 179 | 180 | This method applies iterative outlier removal on the imputed data, updates the filtered 181 | DataFrame, and generates a PCA plot of the corrected data with outliers removed. 182 | 183 | Parameters: 184 | n_components : int, optional 185 | Number of principal components to compute. Defaults to a third of the unique batch indices. 186 | min_cluster_size : int, optional 187 | Minimum size of clusters for outlier detection. Defaults to the median number of samples per batch. 188 | min_samples_num : int, optional 189 | Minimum number of samples in a neighborhood for a point to be considered a core point. 190 | Defaults to the median number of samples per batch. 191 | n_iter : int, optional 192 | Number of iterations for outlier removal. Defaults to 5. 193 | 194 | Notes 195 | ----- 196 | - The method modifies the instance's DataFrame by removing outliers. 197 | - A PCA plot is saved as 'pca_corrected_outliers_removed.png'. 
198 | """ 199 | logger.info("Removing outliers from imputed data ...") 200 | # Apply iterative outlier removal on imputed data 201 | # get batch indices from the columns names 202 | batches = [sample.split("-")[0] for sample in self.samples] 203 | self.samples_number = {dataset: batches.count(dataset) for dataset in self.datasets} 204 | min_samples = round(np.median(list(self.samples_number.values()))) 205 | if min_samples == 1: 206 | min_samples = 2 207 | # apply iterative outlier removal 208 | self.df_filtered_outliers = iterative_outlier_removal( 209 | self.df, 210 | self.batch_index, 211 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 212 | min_cluster_size=min_cluster_size if min_cluster_size else min_samples, 213 | min_samples=min_samples_num if min_samples_num else min_samples, 214 | n_iter=n_iter if n_iter else 5, 215 | ) 216 | logger.info(self.df_filtered_outliers) 217 | # plot PCA of corrected data with outliers removed 218 | # transpose the dataframe to get samples as rows and features as columns 219 | self.df_pca = compute_pca( 220 | self.df_filtered_outliers.T, 221 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 222 | ) 223 | 224 | # add batch information to the dataframe 225 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 226 | 227 | # plot PC1 vs PC2 with batch information using seaborn 228 | # put the legend outside the plot 229 | # save the plot as a png file 230 | plot_pca( 231 | self.df_pca, 232 | title="PCA plot of corrected data with outliers removed", 233 | output_file="pca_corrected_outliers_removed.png", 234 | ) 235 | 236 | def batch_correction(self, n_components: int = None, tissue_parts_to_keep: int = None): 237 | """ 238 | Apply batch effect correction to the data and plot PCA results. 239 | 240 | This method performs batch correction on the data using specified covariates 241 | and plots PCA before and after correction. It filters out batches with only 242 | one sample and optionally retains specific tissue parts. 243 | 244 | Parameters: 245 | n_components : int, optional 246 | Number of principal components to compute. Defaults to a third of the unique batch indices. 247 | tissue_parts_to_keep : int, optional 248 | Number of tissue parts to retain in the data. 249 | 250 | Notes 251 | ----- 252 | - The method modifies the instance's DataFrame by applying batch correction. 253 | - PCA plots are saved as 'pca_uncorrected.png' and 'pca_corrected.png'. 
254 | """ 255 | logger.info("Applying batch effect correction ...") 256 | # Plot PCA of uncorrected imputed data 257 | # transpose the dataframe to get samples as rows and features as columns 258 | self.df_pca = compute_pca( 259 | self.df.T, 260 | n_components=(n_components if n_components else round(len(set(self.batch_index)) / 3)), 261 | ) 262 | 263 | # add batch information to the dataframe 264 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 265 | 266 | # plot PC1 vs PC2 with batch information using seaborn 267 | # put the legend outside the plot 268 | # save the plot as a png file 269 | plot_pca( 270 | self.df_pca, 271 | title="PCA plot of uncorrected data", 272 | output_file="pca_uncorrected.png", 273 | ) 274 | 275 | # keep samples only in tissue_part from metadata 276 | # TODO: specify covariates 277 | if tissue_parts_to_keep: 278 | self.metadata = self.metadata[self.metadata["tissue_part"].isin(tissue_parts_to_keep)] 279 | samples_to_keep = self.metadata["sample_id"].tolist() 280 | 281 | # keep samples in df that are also in samples_to_keep 282 | self.df = self.df[[s for s in self.df.columns if s in samples_to_keep]] 283 | 284 | # 2. Apply batch correction with covariate information 285 | # Before apply batch correction, filter out batches with just one sample (otherwise the batch correction will fail). 286 | batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) 287 | self.df = remove_single_sample_batches(self.df, batch_index) 288 | 289 | # get covariate information from metadata. 290 | columns = self.df.columns.tolist() 291 | self.metadata = self.metadata[self.metadata["sample_id"].isin(columns)] 292 | # reorder metadata to match the order of columns in df 293 | self.metadata = self.metadata.reset_index(drop=True) 294 | self.metadata = self.metadata.set_index("sample_id").reindex(columns, axis=0).reset_index() 295 | if self.covariate: 296 | # get the covariates from metadata as a list 297 | covariates_index = self.metadata[self.covariate].tolist() 298 | else: 299 | covariates_index = [] 300 | 301 | # apply batch correction 302 | self.df_corrected = apply_batch_correction( 303 | self.df, self.batch_index, covs=covariates_index 304 | ) 305 | logger.info(self.df_corrected) 306 | 307 | # plot PCA of corrected data 308 | # transpose the dataframe to get samples as rows and features as columns 309 | # add batch information to the dataframe 310 | self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] 311 | 312 | # plot PC1 vs PC2 with batch information using seaborn 313 | # put the legend outside the plot 314 | # save the plot as a png file 315 | plot_pca( 316 | self.df_pca, 317 | title="PCA plot of corrected data", 318 | output_file="pca_corrected.png", 319 | ) 320 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/file_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import warnings 4 | from typing import List, Optional, TYPE_CHECKING 5 | 6 | import pandas as pd 7 | 8 | from ibaqpy.ibaq.ibaqpy_postprocessing import pivot_wider 9 | 10 | 11 | if TYPE_CHECKING: 12 | import anndata as an 13 | 14 | logger = logging.getLogger(__name__) 15 | logger.addHandler(logging.NullHandler()) 16 | 17 | 18 | def create_anndata( 19 | df: pd.DataFrame, 20 | obs_col: str, 21 | var_col: str, 22 | value_col: str, 23 | layer_cols: Optional[List[str]] = None, 24 | obs_metadata_cols: Optional[List[str]] = None, 25 | var_metadata_cols: 
Optional[List[str]] = None, 26 | ) -> "an.AnnData": 27 | """ 28 | Create an AnnData object from a long-format DataFrame. 29 | 30 | Parameters: 31 | df (pd.DataFrame): Input data in long format. 32 | obs_col (str): Column name in df representing observation IDs. 33 | var_col (str): Column name in df representing variable IDs. 34 | value_col (str): Column name in df representing the main data values. 35 | layer_cols (Optional[List[str]]): List of column names in df to add as additional layers. 36 | obs_metadata_cols (Optional[List[str]]): List of column names in df to add as observation metadata. 37 | var_metadata_cols (Optional[List[str]]): List of column names in df to add as variable metadata. 38 | 39 | Returns: 40 | anndata.AnnData: The constructed AnnData object. 41 | """ 42 | 43 | import anndata as an 44 | 45 | if df.empty: 46 | raise ValueError("Cannot create AnnData object from empty DataFrame") 47 | # Validate that the required columns exist in the DataFrame. 48 | required_cols = [obs_col, var_col, value_col] 49 | missing = [col for col in required_cols if col not in df.columns] 50 | if missing: 51 | raise ValueError( 52 | f"The following required columns are missing from the input DataFrame: {missing}" 53 | ) 54 | 55 | # Pivot the long dataframe to create a wide-format matrix for the main values. 56 | df_matrix = pivot_wider(df, row_name=obs_col, col_name=var_col, values=value_col, fillna=True) 57 | if df_matrix.empty: 58 | raise ValueError("Pivot operation resulted in an empty DataFrame") 59 | if df_matrix.shape[0] == 0 or df_matrix.shape[1] == 0: 60 | raise ValueError("Pivot operation resulted in a DataFrame with zero dimensions") 61 | 62 | # Create the AnnData object with the main data matrix. 63 | adata = an.AnnData( 64 | X=df_matrix.to_numpy(), 65 | obs=df_matrix.index.to_frame(), 66 | var=df_matrix.columns.to_frame(), 67 | ) 68 | 69 | def add_metadata(metadata_df: pd.DataFrame, key: str, cols: List[str]) -> pd.DataFrame: 70 | """ 71 | Add metadata columns to a DataFrame by mapping values from the original long dataframe. 72 | 73 | Parameters: 74 | metadata_df (pd.DataFrame): DataFrame (either adata.obs or adata.var) to update. 75 | key (str): The column name used as key (obs_col for observations, var_col for variables). 76 | cols (List[str]): List of metadata columns to add. 77 | 78 | Returns: 79 | pd.DataFrame: The updated metadata DataFrame. 80 | """ 81 | for col in cols: 82 | if col not in df.columns: 83 | warnings.warn( 84 | f"Column '{col}' not found in the input DataFrame. Skipping metadata for '{col}'." 85 | ) 86 | continue 87 | # Create a mapping from key to metadata values. 88 | mapping = df[[key, col]].drop_duplicates().set_index(key)[col] 89 | metadata_df[col] = metadata_df.index.map(mapping) 90 | return metadata_df 91 | 92 | # Add observation metadata, if provided. 93 | if obs_metadata_cols: 94 | adata.obs = add_metadata(adata.obs, obs_col, obs_metadata_cols) 95 | 96 | # Add variable metadata, if provided. 97 | if var_metadata_cols: 98 | adata.var = add_metadata(adata.var, var_col, var_metadata_cols) 99 | 100 | # Add additional layers (if any) using a similar pivot operation. 101 | if layer_cols: 102 | for layer_col in layer_cols: 103 | if layer_col not in df.columns: 104 | warnings.warn( 105 | f"Layer column '{layer_col}' not found in the input DataFrame. Skipping layer '{layer_col}'." 
106 | )
107 | continue
108 | df_layer = pivot_wider(
109 | df, row_name=obs_col, col_name=var_col, values=layer_col, fillna=True
110 | )
111 | adata.layers[layer_col] = df_layer.to_numpy()
112 | 
113 | logger.info(f"Created AnnData object:\n {adata}")
114 | 
115 | return adata
116 | 
117 | 
118 | def combine_ibaq_tsv_files(
119 | dir_path: str, pattern: str = "*", comment: str = "#", sep: str = "\t"
120 | ) -> pd.DataFrame:
121 | """
122 | Combine multiple TSV files from a directory into a single pandas DataFrame.
123 | 
124 | Parameters:
125 | dir_path : str
126 | Directory path containing the TSV files.
127 | pattern : str, optional
128 | Pattern to match files in the directory (default is '*').
129 | comment : str, optional
130 | Character to indicate the start of a comment line (default is '#').
131 | It will skip lines starting with this character when reading the TSV files.
132 | sep : str, optional
133 | Delimiter to use for reading the TSV files (default is '\t').
134 | 
135 | Returns:
136 | pd.DataFrame
137 | Combined DataFrame containing data from all TSV files. A FileNotFoundError is raised if no files match the pattern, and a ValueError if a file cannot be read or its columns differ from those of the first file.
138 | 
139 | Examples
140 | --------
141 | dir_path = './ibaqpy-research-data/ibaq-hela-raw'
142 | combined_df = combine_ibaq_tsv_files(dir_path, pattern='*ibaq.tsv', comment='#', sep='\t')
143 | """
144 | file_paths = glob.glob(f"{dir_path}/{pattern}")
145 | 
146 | if not file_paths:
147 | raise FileNotFoundError(
148 | f"No files found in the directory '{dir_path}' matching the pattern '{pattern}'."
149 | )
150 | 
151 | dataframes = []
152 | 
153 | first_schema = None
154 | for file_path in file_paths:
155 | try:
156 | # Read the TSV file, skipping lines that start with the comment character
157 | df = pd.read_csv(file_path, sep=sep, comment=comment)
158 | 
159 | # Validate schema consistency
160 | if first_schema is None:
161 | first_schema = set(df.columns)
162 | elif set(df.columns) != first_schema:
163 | raise ValueError(
164 | f"Schema mismatch in file '{file_path}'. 
" 165 | f"Expected columns: {sorted(first_schema)}, " 166 | f"got: {sorted(df.columns)}" 167 | ) 168 | 169 | dataframes.append(df) 170 | except Exception as e: 171 | raise ValueError(f"Error reading file '{file_path}': {str(e)}") 172 | 173 | # Concatenate all DataFrames 174 | combined_df = pd.concat(dataframes, ignore_index=True) 175 | 176 | return combined_df 177 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/ibaqpy_commons.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | from matplotlib import pyplot as plt 8 | from matplotlib.figure import Figure 9 | 10 | 11 | PROTEIN_NAME = "ProteinName" 12 | PEPTIDE_SEQUENCE = "PeptideSequence" 13 | PEPTIDE_CANONICAL = "PeptideCanonical" 14 | PEPTIDE_CHARGE = "PrecursorCharge" 15 | CHANNEL = "Channel" 16 | MIXTRUE = "Mixture" 17 | TECHREPMIXTURE = "TechRepMixture" 18 | CONDITION = "Condition" 19 | BIOREPLICATE = "BioReplicate" 20 | TECHREPLICATE = "TechReplicate" 21 | RUN = "Run" 22 | FRACTION = "Fraction" 23 | INTENSITY = "Intensity" 24 | NORM_INTENSITY = "NormIntensity" 25 | REFERENCE = "Reference" 26 | SAMPLE_ID = "SampleID" 27 | SAMPLE_ID_REGEX = r"^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$" 28 | SEARCH_ENGINE = "searchScore" 29 | SCAN = "Scan" 30 | MBR = "MatchBetweenRuns" 31 | IBAQ = "Ibaq" 32 | IBAQ_NORMALIZED = "IbaqNorm" 33 | IBAQ_LOG = "IbaqLog" 34 | IBAQ_BEC = "IbaqBec" 35 | IBAQ_PPB = "IbaqPpb" 36 | TPA = "TPA" 37 | MOLECULARWEIGHT = "MolecularWeight" 38 | COPYNUMBER = "CopyNumber" 39 | CONCENTRATION_NM = "Concentration[nM]" 40 | WEIGHT_NG = "Weight[ng]" 41 | MOLES_NMOL = "Moles[nmol]" 42 | GLOBALMEDIAN = "globalMedian" 43 | CONDITIONMEDIAN = "conditionMedian" 44 | 45 | 46 | PARQUET_COLUMNS = [ 47 | "pg_accessions", 48 | "peptidoform", 49 | "sequence", 50 | "precursor_charge", 51 | "channel", 52 | "condition", 53 | "biological_replicate", 54 | "run", 55 | "fraction", 56 | "intensity", 57 | "reference_file_name", 58 | "sample_accession", 59 | ] 60 | 61 | 62 | parquet_map = { 63 | "pg_accessions": PROTEIN_NAME, 64 | "peptidoform": PEPTIDE_SEQUENCE, 65 | "sequence": PEPTIDE_CANONICAL, 66 | "precursor_charge": PEPTIDE_CHARGE, 67 | "channel": CHANNEL, 68 | "condition": CONDITION, 69 | "biological_replicate": BIOREPLICATE, 70 | "run": RUN, 71 | "fraction": FRACTION, 72 | "intensity": INTENSITY, 73 | "reference_file_name": REFERENCE, 74 | "sample_accession": SAMPLE_ID, 75 | } 76 | 77 | 78 | def get_accession(identifier: str) -> str: 79 | """ 80 | Get protein accession from the identifier (e.g. 
sp|P12345|PROT_NAME) 81 | :param identifier: Protein identifier 82 | :return: Protein accession 83 | """ 84 | identifier_lst = identifier.split("|") 85 | if len(identifier_lst) == 1: 86 | return identifier_lst[0] 87 | else: 88 | return identifier_lst[1] 89 | 90 | 91 | def plot_distributions( 92 | dataset: pd.DataFrame, 93 | field: str, 94 | class_field: str, 95 | title: str = "", 96 | log2: bool = True, 97 | width: float = 10, 98 | ) -> Figure: 99 | """ 100 | Print the quantile plot for the dataset 101 | :param dataset: DataFrame 102 | :param field: Field that would be use in the dataframe to plot the quantile 103 | :param class_field: Field to group the quantile into classes 104 | :param title: Title of the box plot 105 | :param log2: Log the intensity values 106 | :param width: size of the plot 107 | :return: 108 | """ 109 | pd.set_option("mode.chained_assignment", None) 110 | normalize = dataset[[field, class_field]].reset_index(drop=True) 111 | if log2: 112 | normalize[field] = np.log2(normalize[field]) 113 | normalize.dropna(subset=[field], inplace=True) 114 | plt.figure(dpi=500, figsize=(width, 8)) 115 | fig = sns.kdeplot(data=normalize, x=field, hue=class_field, palette="Paired", linewidth=2) 116 | sns.despine(ax=fig, top=True, right=True) 117 | plt.title(title) 118 | pd.set_option("mode.chained_assignment", "warn") 119 | 120 | return plt.gcf() 121 | 122 | 123 | def plot_box_plot( 124 | dataset: pd.DataFrame, 125 | field: str, 126 | class_field: str, 127 | log2: bool = False, 128 | width: float = 10, 129 | rotation: int = 30, 130 | title: str = "", 131 | violin: bool = False, 132 | ) -> Figure: 133 | """ 134 | Plot a box plot of two values field and classes field 135 | :param violin: Also add violin on top of box plot 136 | :param dataset: Dataframe with peptide intensities 137 | :param field: Intensity field 138 | :param class_field: class to group the peptides 139 | :param log2: transform peptide intensities to log scale 140 | :param width: size of the plot 141 | :param rotation: rotation of the x-axis 142 | :param title: Title of the box plot 143 | :return: 144 | """ 145 | pd.set_option("mode.chained_assignment", None) 146 | normalized = dataset[[field, class_field]] 147 | np.seterr(divide="ignore") 148 | plt.figure(figsize=(width, 14)) 149 | if log2: 150 | normalized[field] = np.log2(normalized[field]) 151 | 152 | if violin: 153 | chart = sns.violinplot( 154 | x=class_field, 155 | y=field, 156 | data=normalized, 157 | boxprops=dict(alpha=0.3), 158 | palette="muted", 159 | ) 160 | else: 161 | chart = sns.boxplot( 162 | x=class_field, 163 | y=field, 164 | data=normalized, 165 | boxprops=dict(alpha=0.3), 166 | palette="muted", 167 | ) 168 | 169 | chart.set(title=title) 170 | chart.set_xticklabels(chart.get_xticklabels(), rotation=rotation, ha="right") 171 | pd.set_option("mode.chained_assignment", "warn") 172 | 173 | return plt.gcf() 174 | 175 | 176 | # Functions needed by Combiner 177 | def load_sdrf(sdrf_path: str) -> pd.DataFrame: 178 | """ 179 | Load SDRF TSV as a dataframe. 180 | 181 | Parameters 182 | ---------- 183 | sdrf_path : str 184 | Path to SDRF TSV. 185 | 186 | Returns 187 | ------- 188 | pd.DataFrame 189 | """ 190 | if not os.path.exists(sdrf_path): 191 | raise FileNotFoundError(f"{sdrf_path} does not exist!") 192 | sdrf_df = pd.read_csv(sdrf_path, sep="\t") 193 | sdrf_df.columns = [col.lower() for col in sdrf_df.columns] 194 | return sdrf_df 195 | 196 | 197 | def load_feature(feature_path: str) -> pd.DataFrame: 198 | """ 199 | Load feature file as a dataframe. 
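
For example (the path below is from this repository's test data; any
``.parquet`` or ``.csv`` feature table works):

df = load_feature("tests/example/feature.parquet")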
200 | 201 | Parameters 202 | ---------- 203 | feature_path : str 204 | Path to feature file. 205 | 206 | Returns 207 | ------- 208 | pd.DataFrame 209 | 210 | Raises 211 | ------ 212 | ValueError 213 | If the provided file's suffix is not supported, either "parquet" or "csv 214 | 215 | """ 216 | suffix = os.path.splitext(feature_path)[1][1:] 217 | if suffix == "parquet": 218 | return pd.read_parquet(feature_path) 219 | elif suffix == "csv": 220 | return pd.read_csv(feature_path) 221 | else: 222 | raise ValueError( 223 | f"{suffix} is not allowed as input, please provide msstats_in or feature parquet." 224 | ) 225 | 226 | 227 | def is_parquet(path: str) -> bool: 228 | """ 229 | Check if a file is in Parquet format. 230 | 231 | This function attempts to open the specified file and read its header 232 | to determine if it matches the Parquet file signature. 233 | 234 | Parameters 235 | ---------- 236 | path : str 237 | The file path to check. 238 | 239 | Returns 240 | ------- 241 | bool 242 | True if the file is a Parquet file, False otherwise. 243 | """ 244 | try: 245 | with open(path, "rb") as fh: 246 | header = fh.read(4) 247 | return header == b"PAR1" 248 | except IOError: 249 | return False 250 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/ibaqpy_postprocessing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Union 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.ibaqpy_commons import ( 7 | IBAQ, 8 | IBAQ_NORMALIZED, 9 | IBAQ_PPB, 10 | IBAQ_LOG, 11 | TPA, 12 | COPYNUMBER, 13 | PROTEIN_NAME, 14 | SAMPLE_ID, 15 | ) 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | 21 | 22 | def remove_samples_low_protein_number(ibaq_df: pd.DataFrame, min_protein_num: int) -> pd.DataFrame: 23 | """ 24 | Remove samples with a low number of unique proteins from the DataFrame. 25 | 26 | This function filters out samples from the given DataFrame that have fewer 27 | unique proteins than the specified minimum threshold. 28 | 29 | Parameters 30 | ---------- 31 | ibaq_df : pd.DataFrame 32 | The input DataFrame containing iBAQ data. 33 | min_protein_num : int 34 | The minimum number of unique proteins required to keep a sample. 35 | 36 | Returns 37 | ------- 38 | pd.DataFrame 39 | A filtered DataFrame containing only samples with at least the specified number of unique proteins. 40 | """ 41 | 42 | protein_num = ibaq_df.groupby(SAMPLE_ID)[PROTEIN_NAME].nunique() 43 | 44 | # Get the samples with more than min_protein_num proteins 45 | samples_to_keep = protein_num[protein_num >= min_protein_num].index 46 | samples_to_remove = protein_num[protein_num < min_protein_num].index 47 | 48 | logger.info( 49 | "The number of samples with number of proteins lower than {} is {}".format( 50 | min_protein_num, len(samples_to_remove) 51 | ) 52 | ) 53 | 54 | # Filter the samples 55 | ibaq_df = ibaq_df[ibaq_df["SampleID"].isin(samples_to_keep)] 56 | return ibaq_df 57 | 58 | 59 | def remove_missing_values( 60 | ibaq_df: pd.DataFrame, 61 | missingness_percentage: float = 30, 62 | expression_column: str = IBAQ, 63 | ) -> pd.DataFrame: 64 | """ 65 | Remove samples from the DataFrame based on missing values in the expression column. 66 | 67 | This function filters out samples from the input DataFrame where the percentage 68 | of missing values in the specified expression column exceeds the given threshold. 
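
For example, to drop samples whose iBAQ column is more than 30% missing
(illustrative call):

filtered = remove_missing_values(ibaq_df, missingness_percentage=30)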
69 | 70 | Parameters 71 | ---------- 72 | ibaq_df : pd.DataFrame 73 | The input DataFrame containing iBAQ data. 74 | missingness_percentage : float 75 | The threshold percentage of missing values allowed per sample. 76 | expression_column : str 77 | The column name in the DataFrame representing expression values. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | A DataFrame with samples filtered based on the missingness criteria. 83 | 84 | Raises 85 | ------ 86 | ValueError 87 | If the input is not a DataFrame or if the expression column is not present. 88 | """ 89 | 90 | # Ensure the input is a DataFrame 91 | if not isinstance(ibaq_df, pd.DataFrame): 92 | raise ValueError("The input ibaq_df must be a pandas DataFrame.") 93 | 94 | if expression_column not in ibaq_df.columns: 95 | raise ValueError(f"The expression column '{expression_column}' is not in the DataFrame.") 96 | 97 | # Initial number of samples 98 | initial_sample_count = ibaq_df["SampleID"].nunique() 99 | logger.info(f"Initial number of samples: {initial_sample_count}") 100 | 101 | # Create a pivot table to organize data by ProteinName and SampleID 102 | pivot_df = ibaq_df.pivot_table(index=PROTEIN_NAME, columns=SAMPLE_ID, values=expression_column) 103 | 104 | # Remove samples where all proteins have missing values 105 | non_missing_samples = pivot_df.columns[pivot_df.notna().any(axis=0)] 106 | 107 | # Compute missingness percentage per sample 108 | missingness = pivot_df[non_missing_samples].isna().sum() / len(pivot_df) * 100 109 | 110 | # Filter samples based on the missingness percentage threshold 111 | valid_samples = missingness[missingness <= missingness_percentage].index 112 | 113 | # Filter the original DataFrame for valid samples 114 | filtered_df = ibaq_df[ibaq_df[SAMPLE_ID].isin(valid_samples)] 115 | 116 | # Final number of samples 117 | final_sample_count = filtered_df[SAMPLE_ID].nunique() 118 | logger.info(f"Final number of samples: {final_sample_count}") 119 | 120 | removed_sample_count = initial_sample_count - final_sample_count 121 | logger.info(f"Number of samples removed: {removed_sample_count}") 122 | 123 | return filtered_df 124 | 125 | 126 | def describe_expression_metrics(ibaq_df: pd.DataFrame) -> pd.DataFrame: 127 | """ 128 | Generate descriptive statistics for expression metrics in an iBAQ DataFrame. 129 | 130 | This function calculates descriptive statistics for specific expression 131 | metrics within the provided iBAQ DataFrame, grouped by sample ID. 132 | 133 | Parameters 134 | ---------- 135 | ibaq_df : pd.DataFrame 136 | The DataFrame containing iBAQ expression data. 137 | 138 | Returns: 139 | pd.DataFrame: 140 | A DataFrame with descriptive statistics for the expression metrics, grouped by 141 | sample ID. 142 | """ 143 | 144 | possible_expression_values = [ 145 | IBAQ, 146 | IBAQ_NORMALIZED, 147 | IBAQ_LOG, 148 | IBAQ_PPB, 149 | TPA, 150 | COPYNUMBER, 151 | ] 152 | 153 | # Define the expression columns 154 | expression_columns = [col for col in ibaq_df.columns if col in possible_expression_values] 155 | 156 | # Get the metrics 157 | metrics = ibaq_df.groupby(SAMPLE_ID)[expression_columns].describe() 158 | return metrics 159 | 160 | 161 | def pivot_wider( 162 | df: pd.DataFrame, 163 | row_name: str, 164 | col_name: str, 165 | values: str, 166 | fillna: Union[int, float, bool] = False, 167 | ) -> pd.DataFrame: 168 | """ 169 | Create a matrix from a DataFrame given the row, column, and value columns. 
170 | 171 | Parameters 172 | ---------- 173 | df : pd.DataFrame 174 | The input DataFrame in long format. 175 | row_name : str 176 | The column name to use as row labels (e.g., sample_ids). 177 | col_name : str 178 | The column name to use as column labels (e.g., protein_names). 179 | values : str 180 | The column name to use as cell values (e.g., expression_values). 181 | fillna : Optional[Union[bool, int, float]] 182 | Value to fill NaN. If True, fill NaN with 0. If False or None, leave NaN as is. 183 | If a number is provided, use that value. 184 | 185 | Returns 186 | ------- 187 | pd.DataFrame 188 | A pivot table (matrix) with specified rows, columns, and values. 189 | 190 | Examples 191 | -------- 192 | >>> df_matrix = pivot_wider(combined_df, 193 | row_name='SampleID', 194 | col_name='ProteinName', 195 | values='Ibaq', 196 | fillna=False) 197 | """ 198 | # Check if the provided columns exist in the DataFrame 199 | missing_columns = {row_name, col_name, values} - set(df.columns) 200 | if missing_columns: 201 | raise ValueError(f"Columns {missing_columns} not found in the DataFrame.") 202 | 203 | # Check for duplicate combinations 204 | duplicates = df.groupby([row_name, col_name]).size() 205 | if (duplicates > 1).any(): 206 | raise ValueError( 207 | f"Found duplicate combinations of {row_name} and {col_name}. " 208 | "Use an aggregation function to handle duplicates." 209 | ) 210 | 211 | # Use pivot_table to create the matrix 212 | matrix = df.pivot_table(index=row_name, columns=col_name, values=values, aggfunc="first") 213 | 214 | # Simplified NaN handling 215 | if fillna is True: # Fill with 0 if True 216 | matrix = matrix.fillna(0) 217 | elif fillna not in [None, False]: # Fill if a specific value is provided 218 | matrix = matrix.fillna(fillna) 219 | 220 | return matrix 221 | 222 | 223 | def pivot_longer(df: pd.DataFrame, row_name: str, col_name: str, values: str) -> pd.DataFrame: 224 | """ 225 | Transforms a wide-format DataFrame into a long-format DataFrame. 226 | 227 | This function takes a DataFrame and pivots it from a wide format to a long format 228 | using the specified row name, column name, and values. It validates the input 229 | DataFrame and checks for the existence of the specified row name. The function 230 | also logs a warning if any missing values are found in the resulting DataFrame. 231 | 232 | Parameters 233 | ---------- 234 | df : pd.DataFrame 235 | The input DataFrame to be transformed. 236 | row_name : str 237 | The name of the column to use as the identifier variable. 238 | col_name : str 239 | The name for the new column that will contain the former column names. 240 | values : str 241 | The name for the new column that will contain the values. 242 | 243 | Returns 244 | ------- 245 | pd.DataFrame 246 | A long-format DataFrame with specified column names. 
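
Examples
--------
Round-trip of a matrix produced by ``pivot_wider`` (names are illustrative):

long_df = pivot_longer(df_matrix, row_name='SampleID', col_name='ProteinName', values='Ibaq')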
247 | """ 248 | # Validate input DataFrame 249 | if not isinstance(df, pd.DataFrame): 250 | raise ValueError("Input must be a pandas DataFrame") 251 | 252 | # Validate row_name exists in DataFrame 253 | if row_name not in df.columns: 254 | raise ValueError(f"Row name '{row_name}' not found in DataFrame") 255 | 256 | # Reset the index to convert the row labels to a column 257 | matrix_reset = df.reset_index() 258 | 259 | # Use pd.melt to convert the wide-format DataFrame to long-format 260 | long_df = pd.melt(matrix_reset, id_vars=[row_name], var_name=col_name, value_name=values) 261 | 262 | # Remove rows with missing values if any 263 | if long_df[values].isna().any(): 264 | logging.warning(f"Found {long_df[values].isna().sum()} missing values in the result") 265 | 266 | return long_df 267 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/imputation_methods.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union, List 2 | 3 | import pandas as pd 4 | from sklearn.impute import KNNImputer, SimpleImputer 5 | 6 | 7 | def impute_missing_values( 8 | data: Optional[Union[pd.DataFrame, List[pd.DataFrame], None]], 9 | method: str = "knn", 10 | n_neighbors: int = 5, 11 | weights: str = "uniform", 12 | metric: str = "nan_euclidean", 13 | keep_empty_features: bool = True, 14 | fill_value: float = 0.0, 15 | ) -> Union[pd.DataFrame, List[pd.DataFrame], None]: 16 | """ 17 | Impute missing values in a DataFrame or a list of DataFrames using KNN, mean, median, most frequent, or a specific value. 18 | 19 | Parameters: 20 | data : Optional[Union[pd.DataFrame, List[pd.DataFrame]]] 21 | A pandas DataFrame or a list of pandas DataFrames containing missing values to be imputed. 22 | The DataFrame(s) must adhere to the following format: 23 | - Rows represent samples (observations). 24 | - Columns represent features (variables). 25 | - Contain only numerical columns (e.g., float or int). 26 | - Missing values must be explicitly represented as `np.nan` or `pd.NA`. 27 | - Columns with non-numerical data (e.g., categorical or text) should be preprocessed 28 | (e.g., encoded into numerical values) before using this function. 29 | - Features (columns) with entirely missing values are handled based on the 30 | `keep_empty_features` parameter. 31 | method : str, optional 32 | The imputation method to use. Options are: 33 | - "knn" (default): Use K-Nearest Neighbors imputation. 34 | - "mean": Impute using the mean of each column. 35 | - "median": Impute using the median of each column. 36 | - "most_frequent": Impute using the most frequent value of each column. 37 | - "constant": Impute using a specific value provided via `fill_value`. 38 | n_neighbors : int, optional 39 | The number of neighboring samples to use for KNN imputation. Default is 5. 40 | weights : str, optional 41 | The weight function used in KNN prediction. Can be 'uniform' or 'distance'. Default is 'uniform'. 42 | metric : str, optional 43 | The distance metric used for finding neighbors in KNN. Default is 'nan_euclidean'. 44 | fill_value : float, optional 45 | The constant value to use for imputation when `method` is "constant". Default is 0.0. 46 | keep_empty_features : bool, optional 47 | Whether to keep features that are entirely empty (i.e., all values are NaN). Default is True. 48 | 49 | Returns: 50 | Union[pd.DataFrame, List[pd.DataFrame]] 51 | A pandas DataFrame or a list of pandas DataFrames with imputed missing values. 
52 | If the input is None, the function will return None. 53 | 54 | Notes 55 | ----- 56 | - This function uses sklearn's KNNImputer and SimpleImputer for imputing missing values. 57 | - The `nan_euclidean` metric is specifically designed to handle NaN values during distance computation. 58 | - Column names and indices are preserved in the output. 59 | - Ensure the input data is numerical and properly formatted for the imputer. 60 | """ 61 | if data is None: 62 | # placeholder for further implementation 63 | return None 64 | 65 | if method not in {"knn", "mean", "median", "constant", "most_frequent"}: 66 | raise ValueError( 67 | "Invalid method. Choose from 'knn', 'mean', 'median', 'most_frequent', or 'constant'." 68 | ) 69 | 70 | if method == "knn": 71 | imputer = KNNImputer( 72 | n_neighbors=n_neighbors, 73 | weights=weights, 74 | metric=metric, 75 | keep_empty_features=keep_empty_features, 76 | ) 77 | else: 78 | strategy = method 79 | imputer = SimpleImputer(strategy=strategy, fill_value=fill_value) 80 | 81 | def impute(df: pd.DataFrame) -> pd.DataFrame: 82 | imputed_data = imputer.fit_transform(df) 83 | return pd.DataFrame(imputed_data, columns=df.columns, index=df.index) 84 | 85 | if isinstance(data, pd.DataFrame): 86 | # Impute missing values for a single DataFrame 87 | return impute(data) 88 | elif isinstance(data, list) and all(isinstance(df, pd.DataFrame) for df in data): 89 | # Impute missing values for a list of DataFrames 90 | return [impute(df) for df in data] 91 | else: 92 | raise ValueError( 93 | "The input data must be a pandas DataFrame, a list of DataFrames, or None." 94 | ) 95 | -------------------------------------------------------------------------------- /ibaqpy/ibaq/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from typing import Optional, Dict, Any, Union 5 | from datetime import datetime 6 | 7 | # Default log format 8 | DEFAULT_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" 9 | DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S" 10 | 11 | # Log levels dictionary for easy conversion from string 12 | LOG_LEVELS = { 13 | "debug": logging.DEBUG, 14 | "info": logging.INFO, 15 | "warning": logging.WARNING, 16 | "error": logging.ERROR, 17 | "critical": logging.CRITICAL, 18 | } 19 | 20 | 21 | class ContextAdapter(logging.LoggerAdapter): 22 | """ 23 | A logger adapter that adds context information to log messages. 24 | This makes logs more useful for debugging by providing additional context. 25 | """ 26 | 27 | def process(self, msg, kwargs): 28 | if self.extra: 29 | context_str = " ".join(f"{k}={v}" for k, v in self.extra.items()) 30 | return f"{msg} [{context_str}]", kwargs 31 | return msg, kwargs 32 | 33 | 34 | def get_logger( 35 | name: str, context: Optional[Dict[str, Any]] = None 36 | ) -> Union[logging.Logger, logging.LoggerAdapter]: 37 | """ 38 | Get a logger with the specified name and optional context. 
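
For example:

>>> log = get_logger("ibaqpy.example", context={"sample": "S-1"})
>>> log.info("processing")  # emitted as: "processing [sample=S-1]"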
39 | 40 | Args: 41 | name: The name of the logger 42 | context: Optional dictionary of context values to include in log messages 43 | 44 | Returns: 45 | A logger or logger adapter with the specified name and context 46 | """ 47 | logger = logging.getLogger(name) 48 | 49 | # If context is provided, return a ContextAdapter 50 | if context: 51 | return ContextAdapter(logger, context) 52 | 53 | return logger 54 | 55 | 56 | def configure_logging( 57 | level: str = "info", 58 | log_file: Optional[str] = None, 59 | log_format: str = DEFAULT_LOG_FORMAT, 60 | date_format: str = DEFAULT_DATE_FORMAT, 61 | propagate: bool = True, 62 | ) -> None: 63 | """ 64 | Configure the logging system for the application. 65 | 66 | Args: 67 | level: The log level (debug, info, warning, error, critical) 68 | log_file: Optional path to a log file 69 | log_format: The format string for log messages 70 | date_format: The format string for timestamps 71 | propagate: Whether to propagate logs to parent loggers 72 | """ 73 | # Convert level string to logging level 74 | log_level = LOG_LEVELS.get(level.lower(), logging.INFO) 75 | 76 | # Configure root logger 77 | root_logger = logging.getLogger() 78 | root_logger.setLevel(log_level) 79 | 80 | # Remove existing handlers to avoid duplicate logs 81 | for handler in root_logger.handlers[:]: 82 | root_logger.removeHandler(handler) 83 | 84 | # Create formatter 85 | formatter = logging.Formatter(log_format, date_format) 86 | 87 | # Configure console handler 88 | console_handler = logging.StreamHandler(sys.stdout) 89 | console_handler.setFormatter(formatter) 90 | root_logger.addHandler(console_handler) 91 | 92 | # Configure file handler if log_file is specified 93 | if log_file: 94 | os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) 95 | file_handler = logging.FileHandler(log_file) 96 | file_handler.setFormatter(formatter) 97 | root_logger.addHandler(file_handler) 98 | 99 | # Configure ibaqpy loggers 100 | ibaqpy_logger = logging.getLogger("ibaqpy") 101 | ibaqpy_logger.setLevel(log_level) 102 | ibaqpy_logger.propagate = propagate 103 | 104 | 105 | def log_execution_time( 106 | logger: Union[logging.Logger, logging.LoggerAdapter], level: int = logging.INFO 107 | ): 108 | """ 109 | Decorator to log the execution time of a function. 110 | 111 | Args: 112 | logger: The logger to use 113 | level: The log level to use 114 | 115 | Returns: 116 | A decorator that logs the execution time of the decorated function 117 | """ 118 | 119 | def decorator(func): 120 | def wrapper(*args, **kwargs): 121 | start_time = datetime.now() 122 | logger.log(level, f"Starting {func.__name__}") 123 | 124 | try: 125 | result = func(*args, **kwargs) 126 | end_time = datetime.now() 127 | execution_time = end_time - start_time 128 | logger.log(level, f"Completed {func.__name__} in {execution_time}") 129 | return result 130 | except Exception as e: 131 | end_time = datetime.now() 132 | execution_time = end_time - start_time 133 | logger.exception(f"Error in {func.__name__} after {execution_time}: {str(e)}") 134 | raise 135 | 136 | return wrapper 137 | 138 | return decorator 139 | 140 | 141 | def log_function_call( 142 | logger: Union[logging.Logger, logging.LoggerAdapter], level: int = logging.DEBUG 143 | ): 144 | """ 145 | Decorator to log function calls with arguments. 
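
For example:

>>> import logging
>>> @log_function_call(logging.getLogger("ibaqpy"), level=logging.INFO)
... def add(a, b):
...     return a + b
>>> add(1, b=2)
3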
146 | 147 | Args: 148 | logger: The logger to use 149 | level: The log level to use 150 | 151 | Returns: 152 | A decorator that logs function calls with arguments 153 | """ 154 | 155 | def decorator(func): 156 | def wrapper(*args, **kwargs): 157 | args_str = ", ".join([str(arg) for arg in args]) 158 | kwargs_str = ", ".join([f"{k}={v}" for k, v in kwargs.items()]) 159 | all_args = ", ".join(filter(None, [args_str, kwargs_str])) 160 | 161 | logger.log(level, f"Calling {func.__name__}({all_args})") 162 | return func(*args, **kwargs) 163 | 164 | return wrapper 165 | 166 | return decorator -------------------------------------------------------------------------------- /ibaqpy/ibaq/logging_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for the ibaqpy package. 3 | 4 | This module provides functions to configure the logging system for the ibaqpy package. 5 | It should be imported and initialized at the start of the application. 6 | """ 7 | 8 | import os 9 | import logging 10 | from typing import Optional 11 | 12 | from ibaqpy.ibaq.logger import configure_logging 13 | 14 | 15 | def initialize_logging( 16 | level: str = "info", 17 | log_file: Optional[str] = None, 18 | log_format: Optional[str] = None, 19 | date_format: Optional[str] = None, 20 | ) -> None: 21 | """ 22 | Initialize the logging system for the ibaqpy package. 23 | 24 | This function should be called at the start of the application to configure 25 | the logging system. It sets up console and file logging with appropriate 26 | formatting. 27 | 28 | Args: 29 | level: The log level (debug, info, warning, error, critical) 30 | log_file: Optional path to a log file 31 | log_format: Optional format string for log messages 32 | date_format: Optional format string for timestamps 33 | """ 34 | # Use environment variables if available 35 | env_level = os.environ.get("IBAQPY_LOG_LEVEL", level) 36 | env_log_file = os.environ.get("IBAQPY_LOG_FILE", log_file) 37 | 38 | # Configure logging 39 | configure_logging( 40 | level=env_level, 41 | log_file=env_log_file, 42 | log_format=log_format, 43 | date_format=date_format, 44 | ) 45 | 46 | # Log initialization 47 | logger = logging.getLogger("ibaqpy") 48 | logger.info("Logging initialized at level %s", env_level.upper()) 49 | if env_log_file: 50 | logger.info("Log file: %s", env_log_file) 51 | 52 | 53 | def get_log_file_path(base_dir: Optional[str] = None) -> str: 54 | """ 55 | Get a default log file path based on the current date. 
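
For example (the base directory is illustrative and is created if missing):

path = get_log_file_path("/tmp/ibaqpy-logs")  # e.g. '/tmp/ibaqpy-logs/ibaqpy_2025-01-31.log'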
56 | 57 | Args: 58 | base_dir: Optional base directory for log files 59 | 60 | Returns: 61 | A path to a log file 62 | """ 63 | import datetime 64 | 65 | # Default to logs directory in current working directory 66 | if base_dir is None: 67 | base_dir = os.path.join(os.getcwd(), "logs") 68 | 69 | # Create logs directory if it doesn't exist 70 | os.makedirs(base_dir, exist_ok=True) 71 | 72 | # Create log file name based on current date 73 | date_str = datetime.datetime.now().strftime("%Y-%m-%d") 74 | log_file = os.path.join(base_dir, f"ibaqpy_{date_str}.log") 75 | 76 | return log_file -------------------------------------------------------------------------------- /ibaqpy/ibaq/write_queue.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from threading import Thread 4 | from queue import Queue, Empty 5 | from typing import Any 6 | 7 | import pandas as pd 8 | import pyarrow as pa 9 | from pyarrow import parquet as pq 10 | 11 | from .logger import get_logger 12 | 13 | # Get a logger for this module 14 | logger = get_logger("ibaqpy.write_queue") 15 | 16 | 17 | class WriteCSVTask(Thread): 18 | """ 19 | A thread-based task for writing pandas DataFrames to a CSV file. 20 | 21 | This class extends the Thread class to asynchronously write DataFrames 22 | to a specified CSV file path. It manages a queue to handle incoming 23 | DataFrames and writes them to the file in the order they are received. 24 | The CSV file is created with an optional header, and additional write 25 | options can be specified. 26 | 27 | Attributes: 28 | path (str): The file path where the CSV will be written. 29 | write_options (dict[str, Any]): Options for writing the CSV file. 30 | _queue (Queue): A queue to manage DataFrames to be written. 31 | _wrote_header (bool): Indicates if the CSV header has been written. 32 | 33 | Methods: 34 | write(table: pd.DataFrame): Adds a DataFrame to the queue for writing. 35 | close(): Signals the thread to finish processing and close. 36 | run(): Continuously processes the queue to write DataFrames to the CSV. 37 | """ 38 | 39 | path: str 40 | 41 | write_options: dict[str, Any] 42 | 43 | _queue: Queue 44 | _wrote_header: bool 45 | 46 | def __init__(self, path: str, daemon: bool = True, write_options: dict = None, **kwargs): 47 | """ 48 | Initializes a WriteCSVTask instance. 49 | 50 | Parameters: 51 | path (str): The file path where the CSV will be written. The extension 52 | will be automatically set to '.csv'. 53 | daemon (bool, optional): Whether the thread should be a daemon thread. 54 | Defaults to True. 55 | write_options (dict, optional): Additional options for writing the CSV 56 | file. Defaults to None. 57 | **kwargs: Additional keyword arguments to be merged with write_options. 58 | 59 | Attributes: 60 | path (str): The file path for the CSV file. 61 | write_options (dict): Options for writing the CSV file. 62 | _wrote_header (bool): Indicates if the CSV header has been written. 63 | _queue (Queue): A queue to manage DataFrames to be written. 64 | """ 65 | super().__init__(daemon=daemon) 66 | if write_options is None: 67 | write_options = {} 68 | 69 | path, _ext = os.path.splitext(path) 70 | path += ".csv" 71 | 72 | self.path = path 73 | self.write_options = write_options | kwargs 74 | self._wrote_header = False 75 | self._queue = Queue() 76 | 77 | def write(self, table: pd.DataFrame): 78 | """ 79 | Adds a DataFrame to the queue for writing to the CSV file. 
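
A minimal usage sketch for the whole task (path and data are illustrative;
the thread must be started before queuing):

task = WriteCSVTask("out/peptides")  # writes to 'out/peptides.csv'
task.start()
task.write(pd.DataFrame({"ProteinName": ["P12345"], "Ibaq": [1.0]}))
task.close()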
80 | 81 | Parameters: 82 | table (pd.DataFrame): The DataFrame to be added to the queue. 83 | """ 84 | logger.debug("Queuing %d rows for CSV writing to %s", len(table), self.path) 85 | self._queue.put(table) 86 | 87 | def close(self): 88 | """ 89 | Signals the thread to finish processing and close the file. 90 | """ 91 | logger.debug("Closing CSV writer queue for %s", self.path) 92 | self._queue.put(None) 93 | self.join() 94 | 95 | def _write(self, table: pd.DataFrame): 96 | """ 97 | Writes a DataFrame to the CSV file specified by the path attribute. 98 | 99 | This method appends the DataFrame to the CSV file if the header has 100 | already been written; otherwise, it writes the DataFrame with the header. 101 | The writing options are specified by the write_options attribute. 102 | 103 | Parameters: 104 | table (pd.DataFrame): The DataFrame to be written to the CSV file. 105 | """ 106 | start_time = time.time() 107 | rows = len(table) 108 | 109 | try: 110 | table.to_csv( 111 | self.path, 112 | header=not self._wrote_header, 113 | mode="a+" if self._wrote_header else "w", 114 | index=False, 115 | **self.write_options, 116 | ) 117 | self._wrote_header = True 118 | 119 | elapsed = time.time() - start_time 120 | logger.debug("Wrote %d rows to CSV file %s in %.2f seconds", rows, self.path, elapsed) 121 | except Exception as e: 122 | logger.error("Error writing to CSV file %s: %s", self.path, str(e)) 123 | raise 124 | 125 | def _close(self): 126 | logger.debug("Closing CSV writer for %s", self.path) 127 | 128 | def run(self): 129 | """ 130 | Continuously processes the queue to write DataFrames to the CSV file. 131 | 132 | This method runs in a loop, retrieving DataFrames from the queue and 133 | writing them to the CSV file using the _write method. The loop exits 134 | when a None value is encountered in the queue, signaling the end of 135 | the writing process. 136 | """ 137 | while True: 138 | try: 139 | table: pd.DataFrame = self._queue.get(True) 140 | except Empty: 141 | continue 142 | 143 | if table is None: 144 | break 145 | 146 | self._write(table) 147 | 148 | 149 | class WriteParquetTask(Thread): 150 | """ 151 | A thread-based task for writing pandas DataFrames to a Parquet file. 152 | 153 | This class extends the Thread class to asynchronously write DataFrames 154 | to a Parquet file using a queue. It manages the ParquetWriter and schema 155 | internally, ensuring that data is written efficiently and safely. 156 | 157 | Attributes: 158 | path (str): The file path where the Parquet file will be written. 159 | metadata (dict[str, Any]): Metadata to be added to the Parquet file. 160 | _queue (Queue): A queue to hold DataFrames to be written. 161 | _schema (pa.Schema): The schema of the Parquet file. 162 | _writer (pq.ParquetWriter): The writer object for the Parquet file. 163 | 164 | Methods: 165 | write(table: pd.DataFrame): Adds a DataFrame to the queue for writing. 166 | close(): Signals the thread to finish writing and close the file. 167 | run(): The main loop of the thread, processing the queue. 
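
Examples
--------
A minimal sketch (path, metadata, and ``df_chunk`` are illustrative; every
queued chunk must share the schema of the first one, since that schema
initializes the writer):

task = WriteParquetTask("out/features", metadata={"tool": "ibaqpy"})
task.start()
task.write(df_chunk)
task.close()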
168 | """ 169 | 170 | path: str 171 | metadata: dict[str, Any] 172 | 173 | _queue: Queue 174 | 175 | _schema: pa.Schema 176 | _writer: pq.ParquetWriter 177 | 178 | def __init__(self, path: str, daemon: bool = True, metadata: dict = None, **kwargs): 179 | super().__init__(daemon=daemon) 180 | 181 | if metadata is None: 182 | metadata = {} 183 | path, _ext = os.path.splitext(path) 184 | path += ".parquet" 185 | 186 | self.path = path 187 | self.metadata = metadata | kwargs 188 | self._queue = Queue() 189 | self._writer = None 190 | self._schema = None 191 | 192 | def write(self, table: pd.DataFrame): 193 | """ 194 | Adds a DataFrame to the queue for writing to the Parquet file. 195 | 196 | Parameters: 197 | table (pd.DataFrame): The DataFrame to be added to the queue. 198 | """ 199 | logger.debug("Queuing %d rows for Parquet writing to %s", len(table), self.path) 200 | self._queue.put(table) 201 | 202 | def close(self): 203 | """ 204 | Signals the thread to finish processing and close the file. 205 | """ 206 | logger.debug("Closing Parquet writer queue for %s", self.path) 207 | self._queue.put(None) 208 | self.join() 209 | 210 | def _close(self): 211 | logger.debug("Closing Parquet writer for %s", self.path) 212 | self._writer.add_key_value_metadata(self.metadata) 213 | self._writer.close() 214 | 215 | def _write(self, table: pd.DataFrame): 216 | start_time = time.time() 217 | rows = len(table) 218 | 219 | try: 220 | if self._schema is None: 221 | self._schema = pa.Schema.from_pandas(table, preserve_index=False) 222 | self._writer = pq.ParquetWriter(self.path, schema=self._schema) 223 | logger.debug("Initialized Parquet writer for %s", self.path) 224 | 225 | arrow_table = pa.Table.from_pandas(table, preserve_index=False) 226 | self._writer.write_table(arrow_table) 227 | 228 | elapsed = time.time() - start_time 229 | logger.debug( 230 | "Wrote %d rows to Parquet file %s in %.2f seconds", rows, self.path, elapsed 231 | ) 232 | except Exception as e: 233 | logger.error("Error writing to Parquet file %s: %s", self.path, str(e)) 234 | raise 235 | 236 | def run(self): 237 | while True: 238 | try: 239 | table: pd.DataFrame = self._queue.get(True) 240 | except Empty: 241 | continue 242 | 243 | if table is None: 244 | break 245 | 246 | self._write(table) 247 | 248 | self._close() -------------------------------------------------------------------------------- /ibaqpy/ibaqpyc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import click 4 | from ibaqpy.commands.features2peptides import features2parquet 5 | from ibaqpy.commands.peptides2protein import peptides2protein 6 | from ibaqpy.commands.tsne_visualization import tsne_visualization 7 | from ibaqpy.commands.correct_batches import correct_batches 8 | 9 | import ibaqpy.__init__ as __init__ 10 | 11 | CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) 12 | 13 | 14 | LOG_LEVELS = ["debug", "info", "warn"] 15 | LOG_LEVELS_TO_LEVELS = { 16 | "debug": logging.DEBUG, 17 | "info": logging.INFO, 18 | "warn": logging.WARN, 19 | } 20 | 21 | 22 | @click.group(context_settings=CONTEXT_SETTINGS) 23 | @click.version_option( 24 | version=__init__.__version__, 25 | package_name="ibaqpy", 26 | message="%(package)s %(version)s", 27 | ) 28 | @click.option( 29 | "-v", 30 | "--log-level", 31 | type=click.Choice(LOG_LEVELS, False), 32 | default="debug", 33 | help="Set the logging level.", 34 | ) 35 | @click.option( 36 | "--log-file", 37 | type=click.Path(writable=True, 
path_type=Path), 38 | required=False, 39 | help="Write log to this file.", 40 | ) 41 | def cli(log_level: str, log_file: Path): 42 | """ 43 | Aggregrate and normalize quantitative proteomics using iBAQ (Intensity-Based Absolute Quantification) 44 | for the quantms ecosystem. 45 | """ 46 | 47 | logging.basicConfig( 48 | format="%(asctime)s [%(funcName)s] - %(message)s", 49 | level=LOG_LEVELS_TO_LEVELS[log_level.lower()], 50 | ) 51 | logging.captureWarnings(True) 52 | 53 | if log_file: 54 | if not log_file.exists(): 55 | if not log_file.parent.exists(): 56 | log_file.parent.mkdir(parents=True, exist_ok=True) 57 | handler = logging.FileHandler(log_file) 58 | handler.setLevel(LOG_LEVELS_TO_LEVELS[log_level.lower()]) 59 | handler.setFormatter(logging.Formatter("%(asctime)s [%(funcName)s] - %(message)s")) 60 | logging.getLogger().addHandler(handler) 61 | 62 | 63 | cli.add_command(features2parquet) 64 | cli.add_command(peptides2protein) 65 | cli.add_command(tsne_visualization) 66 | cli.add_command(correct_batches) 67 | 68 | 69 | def main(): 70 | """ 71 | Main function to run the CLI 72 | """ 73 | try: 74 | cli() 75 | except SystemExit as e: 76 | if e.code != 0: 77 | raise 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /ibaqpy/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/ibaqpy/model/__init__.py -------------------------------------------------------------------------------- /ibaqpy/model/normalization.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from enum import Enum, auto 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.ibaqpy_commons import CONDITION, NORM_INTENSITY, SAMPLE_ID, TECHREPLICATE 7 | 8 | _method_registry: dict["FeatureNormalizationMethod", Callable[[pd.Series], pd.Series]] = {} 9 | 10 | 11 | class FeatureNormalizationMethod(Enum): 12 | """ 13 | FeatureNormalizationMethod is an enumeration of various methods for normalizing 14 | replicate intensities in a DataFrame. It provides functionality to register 15 | custom normalization functions and apply them to replicate data. The class 16 | supports normalization across multiple runs and samples, adjusting replicate 17 | intensities based on a sample-level average metric. Methods include NONE, Mean, 18 | Median, Max, Global, Max_Min, and IQR. The class also includes utility methods 19 | to convert from string representations and to normalize data using registered 20 | functions. 21 | """ 22 | 23 | NONE = auto() 24 | 25 | Mean = auto() 26 | Median = auto() 27 | Max = auto() 28 | Global = auto() 29 | Max_Min = auto() 30 | IQR = auto() 31 | 32 | @classmethod 33 | def from_str(cls, name: str) -> "FeatureNormalizationMethod": 34 | """ 35 | Get the normalization method from a string. 36 | Parameters: 37 | name: str The name of the normalization method. 38 | 39 | Returns: 40 | FeatureNormalizationMethod: The normalization method. 41 | """ 42 | if name is None: 43 | return cls.NONE 44 | name_ = name.lower() 45 | for k, v in cls._member_map_.items(): 46 | if k.lower() == name_: 47 | return v 48 | raise KeyError(name) 49 | 50 | def register_replicate_fn( 51 | self, fn: Callable[[pd.Series], pd.Series] 52 | ) -> Callable[[pd.Series], pd.Series]: 53 | """ 54 | Registers a custom normalization function for replicate intensities. 
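
For example, the built-in methods below register themselves this way; a
custom function registered for the same member overrides the previous one:

>>> @FeatureNormalizationMethod.Mean.register_replicate_fn
... def mean_normalize(df, *args, **kwargs):
...     return df / df.mean()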
55 | 56 | Parameters: 57 | fn (Callable[[pd.Series], pd.Series]): A function that takes a pandas Series 58 | and returns a normalized pandas Series. 59 | 60 | Returns: 61 | Callable[[pd.Series], pd.Series]: The registered normalization function. 62 | """ 63 | _method_registry[self] = fn 64 | return fn 65 | 66 | def normalize_replicates(self, df: pd.DataFrame, *args, **kwargs): 67 | """ 68 | Normalize the replicate intensities in the given DataFrame using a registered 69 | normalization function. 70 | 71 | Parameters: 72 | df (pd.DataFrame): The DataFrame containing replicate intensity data. 73 | *args: Additional positional arguments for the normalization function. 74 | **kwargs: Additional keyword arguments for the normalization function. 75 | 76 | Returns: 77 | pd.Series: The normalized replicate intensities. 78 | """ 79 | fn = _method_registry[self] 80 | return fn(df, *args, **kwargs) 81 | 82 | def normalize_sample(self, df, runs: list[str]) -> tuple[dict[str, pd.Series], float]: 83 | """ 84 | Normalize replicate intensities for a given sample across multiple runs. 85 | 86 | Parameters: 87 | df (pd.DataFrame): The DataFrame containing replicate intensity data. 88 | runs (list[str]): A list of run identifiers for the sample. 89 | 90 | Returns: 91 | tuple[dict[str, pd.Series], float]: A dictionary mapping each run to its 92 | normalized replicate intensities and the average metric across all runs. 93 | """ 94 | map_ = {} 95 | total = 0 96 | for run in runs: 97 | run = str(run) 98 | run_m = self.normalize_replicates(df.loc[df[TECHREPLICATE] == run, NORM_INTENSITY]) 99 | map_[run] = run_m 100 | total += run_m 101 | sample_average_metric = total / len(runs) 102 | return map_, sample_average_metric 103 | 104 | def normalize_runs(self, df: pd.DataFrame, technical_replicates: int): 105 | """ 106 | Normalize the intensities of runs in the given DataFrame using a registered 107 | 108 | Parameters: 109 | df: pd.DataFrame The DataFrame containing replicate intensity data. 110 | technical_replicates: int The number of technical replicates for each sample. 111 | 112 | Returns: 113 | pd.DataFrame: The DataFrame with normalized replicate intensities. 114 | 115 | """ 116 | if technical_replicates > 1: 117 | samples = df[SAMPLE_ID].unique() 118 | for sample in samples: 119 | runs = df.loc[df[SAMPLE_ID] == sample, TECHREPLICATE].unique().tolist() 120 | if len(runs) > 1: 121 | sample_df = df.loc[df[SAMPLE_ID] == sample, :] 122 | 123 | replicate_metric_map, sample_average_metric = self.normalize_sample( 124 | sample_df, runs 125 | ) 126 | 127 | # For each replicate in each sample, normalize the per-replicate 128 | # intensity by a replicate-level statistic, relative to the sample 129 | # average over that replicate statistic. 130 | # 131 | # In effect, this scales runs down when the replicate average > sample average 132 | # and scales runs up when the replicate average < sample average. 
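# Worked example with illustrative numbers: if the per-run metrics are
# {"1": 2.0, "2": 4.0}, the sample average is 3.0; run "1" intensities are
# divided by 2.0/3.0 (scaled up) and run "2" by 4.0/3.0 (scaled down).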
133 | for run in runs: 134 | run = str(run) 135 | run_intensity = df.loc[ 136 | (df[SAMPLE_ID] == sample) & (df[TECHREPLICATE] == run), 137 | NORM_INTENSITY, 138 | ] 139 | df.loc[ 140 | (df[SAMPLE_ID] == sample) & (df[TECHREPLICATE] == run), 141 | NORM_INTENSITY, 142 | ] = run_intensity / (replicate_metric_map[run] / sample_average_metric) 143 | return df 144 | else: 145 | return df 146 | 147 | def __call__(self, df: pd.DataFrame, technical_replicates: int): 148 | return self.normalize_runs(df, technical_replicates) 149 | 150 | 151 | @FeatureNormalizationMethod.NONE.register_replicate_fn 152 | def no_normalization(df, *args, **kwargs): 153 | """ 154 | No normalization is performed on the data. 155 | Parameters: 156 | df: pd.DataFrame The DataFrame containing replicate intensity data. 157 | args: Additional positional arguments 158 | kwargs: Additional keyword arguments 159 | 160 | Returns: 161 | pd.DataFrame: The DataFrame containing the replicate intensity data. 162 | 163 | """ 164 | return df 165 | 166 | 167 | @FeatureNormalizationMethod.Mean.register_replicate_fn 168 | def mean_normalize(df, *args, **kwargs): 169 | """ 170 | Mean normalization of the data. 171 | 172 | Parameters: 173 | df: pd.DataFrame The DataFrame containing replicate intensity data. 174 | args: Additional positional arguments 175 | kwargs: Additional keyword arguments 176 | 177 | Returns: 178 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 179 | 180 | """ 181 | return df / df.mean() 182 | 183 | 184 | @FeatureNormalizationMethod.Median.register_replicate_fn 185 | def median_normalize(df, *args, **kwargs): 186 | """ 187 | Median normalization of the data. 188 | Parameters: 189 | df: pd.DataFrame The DataFrame containing replicate intensity data. 190 | args: Additional positional arguments 191 | kwargs: Additional keyword arguments 192 | 193 | Returns: 194 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 195 | 196 | """ 197 | return df / df.median() 198 | 199 | 200 | @FeatureNormalizationMethod.Max.register_replicate_fn 201 | def max_normalize(df, *args, **kwargs): 202 | """ 203 | Max normalization of the data. 204 | Parameters: 205 | df: pd.DataFrame The DataFrame containing replicate intensity data. 206 | args: Additional positional arguments 207 | kwargs: Additional keyword arguments 208 | 209 | Returns: 210 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 211 | """ 212 | return df / df.max() 213 | 214 | 215 | @FeatureNormalizationMethod.Global.register_replicate_fn 216 | def global_normalize(df, *args, **kwargs): 217 | """ 218 | Global normalization of the data. 219 | Parameters: 220 | df: pd.DataFrame The DataFrame containing replicate intensity data. 221 | args: Additional positional arguments 222 | kwargs: Additional keyword arguments 223 | 224 | Returns: 225 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 226 | """ 227 | return df / df.sum() 228 | 229 | 230 | @FeatureNormalizationMethod.Max_Min.register_replicate_fn 231 | def max_min_normalize(df, *args, **kwargs): 232 | """ 233 | Max-Min normalization of the data 234 | Parameters: 235 | df: pd.DataFrame The DataFrame containing replicate intensity data. 236 | args: Additional positional arguments 237 | kwargs: Additional keyword arguments 238 | 239 | Returns: 240 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 
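
Examples
--------
>>> import pandas as pd
>>> max_min_normalize(pd.Series([1.0, 3.0, 5.0]))
0    0.0
1    0.5
2    1.0
dtype: float64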
241 | """ 242 | min_ = df.min() 243 | return (df - min_) / (df.max() - min_) 244 | 245 | 246 | @FeatureNormalizationMethod.IQR.register_replicate_fn 247 | def iqr_normalization(df, *args, **kwargs): 248 | """ 249 | IQR normalization of the data. 250 | Parameters: 251 | df: pd.DataFrame The DataFrame containing replicate intensity data. 252 | args: Additional positional arguments 253 | kwargs: Additional keyword arguments 254 | 255 | Returns: 256 | pd.DataFrame: The DataFrame containing the normalized replicate intensity data. 257 | """ 258 | return df.quantile([0.75, 0.25], interpolation="linear").mean() 259 | 260 | 261 | _peptide_method_registry = {} 262 | 263 | 264 | class PeptideNormalizationMethod(Enum): 265 | """ 266 | Enum class for peptide normalization methods, providing functionality to register 267 | and apply normalization functions to peptide data. 268 | 269 | Attributes: 270 | NONE: No normalization. 271 | GlobalMedian: Normalization using global median. 272 | ConditionMedian: Normalization using condition-specific median. 273 | 274 | Methods: 275 | from_str(name): Converts a string to a PeptideNormalizationMethod. 276 | register_replicate_fn(fn): Registers a function for a specific normalization method. 277 | normalize_sample(dataset_df, sample, med_map): Applies the registered normalization 278 | function to a sample. 279 | __call__(dataset_df, sample, med_map): Invokes normalize_sample method. 280 | """ 281 | 282 | NONE = auto() 283 | 284 | GlobalMedian = auto() 285 | ConditionMedian = auto() 286 | 287 | @classmethod 288 | def from_str(cls, name: str) -> "PeptideNormalizationMethod": 289 | """ 290 | Converts a string to a PeptideNormalizationMethod. 291 | Parameters: 292 | name: str The name of the normalization method. 293 | 294 | Returns: 295 | PeptideNormalizationMethod: The normalization method. 296 | """ 297 | name_ = name.lower() 298 | for k, v in cls._member_map_.items(): 299 | if k.lower() == name_: 300 | return v 301 | raise KeyError(name) 302 | 303 | def register_replicate_fn( 304 | self, fn: Callable[[pd.DataFrame, str, dict], pd.DataFrame] 305 | ) -> Callable[[pd.DataFrame, str, dict], pd.DataFrame]: 306 | """ 307 | Registers a function for a specific normalization method. 308 | Parameters: 309 | fn: Callable[[pd.DataFrame, str, dict], pd.DataFrame] The normalization function. 310 | 311 | Returns: 312 | Callable[[pd.DataFrame, str, dict], pd.DataFrame]: The normalization function. 313 | """ 314 | _peptide_method_registry[self] = fn 315 | return fn 316 | 317 | def normalize_sample(self, dataset_df: pd.DataFrame, sample: str, med_map: dict): 318 | """ 319 | Applies the registered normalization function to a sample. 320 | Parameters: 321 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 322 | sample: str The sample identifier. 323 | med_map: dict The median map. 324 | 325 | Returns: 326 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 327 | """ 328 | fn = _peptide_method_registry[self] 329 | return fn(dataset_df, sample, med_map) 330 | 331 | def __call__(self, dataset_df: pd.DataFrame, sample: str, med_map: dict): 332 | """ 333 | Invokes the normalize_sample method. 334 | Parameters: 335 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 336 | sample: str The sample identifier. 337 | med_map: dict The median map. 338 | 339 | Returns: 340 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 
341 | """ 342 | return self.normalize_sample(dataset_df, sample, med_map) 343 | 344 | 345 | @PeptideNormalizationMethod.GlobalMedian.register_replicate_fn 346 | def global_median(dataset_df, sample: str, med_map: dict): 347 | """ 348 | Global median normalization of the data. 349 | Parameters: 350 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 351 | sample: str The sample identifier. 352 | med_map: dict The median map. 353 | 354 | Returns: 355 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 356 | """ 357 | dataset_df.loc[:, NORM_INTENSITY] = dataset_df[NORM_INTENSITY] / med_map[sample] 358 | return dataset_df 359 | 360 | 361 | @PeptideNormalizationMethod.ConditionMedian.register_replicate_fn 362 | def condition_median(dataset_df, sample: str, med_map: dict): 363 | """ 364 | Condition median normalization of the data. 365 | Parameters: 366 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 367 | sample: str The sample identifier. 368 | med_map: dict The median map. 369 | 370 | Returns: 371 | pd.DataFrame: The DataFrame containing the normalized peptide intensity data. 372 | """ 373 | con = dataset_df[CONDITION].unique()[0] 374 | dataset_df.loc[:, NORM_INTENSITY] = dataset_df[NORM_INTENSITY] / med_map[con][sample] 375 | 376 | 377 | @PeptideNormalizationMethod.NONE.register_replicate_fn 378 | def peptide_no_normalization(dataset_df, sample, med_map): 379 | """ 380 | No normalization is performed on the data. 381 | Parameters: 382 | dataset_df: pd.DataFrame The DataFrame containing peptide intensity data. 383 | sample: str The sample identifier. 384 | med_map: dict The median map. 385 | 386 | Returns: 387 | pd.DataFrame: The DataFrame containing the peptide intensity data. 388 | """ 389 | return dataset_df 390 | -------------------------------------------------------------------------------- /ibaqpy/model/organism_metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from dataclasses import dataclass, field 4 | from importlib.resources import open_text 5 | from typing import ClassVar, Optional 6 | 7 | 8 | @dataclass 9 | class OrganismDescription: 10 | """ 11 | Represents an organism's metadata, including its name, genome size, and histone proteins. 12 | 13 | Attributes 14 | ---------- 15 | registry (ClassVar[dict[str, OrganismDescription]]): A class-level dictionary to store 16 | registered organisms by their name in uppercase. 17 | name (str): The name of the organism. 18 | genome_size (int): The size of the organism's genome. 19 | histone_proteins (list[str]): A list of histone proteins associated with the organism. 20 | histone_entries (list[str]): A list of histone entries associated with the organism. 21 | 22 | Methods 23 | ------- 24 | get(key, default=None) -> Optional[OrganismDescription]: Retrieves an organism description 25 | from the registry using the given key, returning the default if not found. 26 | registered_organisms(): Returns a list of all registered organism names. 27 | """ 28 | 29 | registry: ClassVar[dict[str, "OrganismDescription"]] = {} 30 | 31 | name: str 32 | genome_size: int 33 | histone_proteins: list[str] = field(default_factory=list, repr=False) 34 | histone_entries: list[str] = field(default_factory=list, repr=False) 35 | 36 | @classmethod 37 | def get(cls, key, default=None) -> "Optional[OrganismDescription]": 38 | """ 39 | Retrieve an organism description from the registry using the given key. 
40 | 41 | Parameters: 42 | key (str): The name of the organism to retrieve, case-insensitive. 43 | default (Optional[OrganismDescription]): The value to return if the organism is not found. 44 | 45 | Returns: 46 | Optional[OrganismDescription]: The organism description if found, otherwise the default value. 47 | """ 48 | return cls.registry.get(key.upper(), default) 49 | 50 | @classmethod 51 | def registered_organisms(cls): 52 | """ 53 | Return a list of all registered organism names. 54 | 55 | Returns: 56 | dict_keys: A view of the registry's keys representing organism names. 57 | """ 58 | return cls.registry.keys() 59 | 60 | def __post_init__(self): 61 | """ 62 | Automatically register the organism in the class-level registry 63 | using its name in uppercase as the key. 64 | """ 65 | self.registry[self.name.upper()] = self 66 | 67 | 68 | for v in json.load(open_text("ibaqpy.data", "organisms.json")).values(): 69 | OrganismDescription(**v) 70 | -------------------------------------------------------------------------------- /ibaqpy/model/quantification_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | from collections.abc import Mapping 4 | from dataclasses import dataclass, field 5 | from typing import ClassVar, Iterator, Union, Optional 6 | 7 | 8 | class QuantificationCategory(Enum): 9 | """ 10 | An enumeration representing different quantification categories used in proteomics. 11 | 12 | Attributes: 13 | TMT: Represents Tandem Mass Tag quantification. 14 | ITRAQ: Represents Isobaric Tags for Relative and Absolute Quantitation. 15 | LFQ: Represents Label-Free Quantification. 16 | 17 | Methods: 18 | from_str(name: str) -> QuantificationCategory: 19 | Converts a string to a QuantificationCategory enum member. 20 | 21 | classify(labels: Union[list[str], set[str]]) -> tuple[Optional[QuantificationCategory], Optional[IsobaricLabel]]: 22 | Classifies a set of labels into a quantification category and determines the isobaric label scheme. 23 | """ 24 | 25 | TMT = auto() 26 | ITRAQ = auto() 27 | LFQ = auto() 28 | 29 | @classmethod 30 | def from_str(cls, name: str) -> "QuantificationCategory": 31 | """ 32 | Converts a string representation of a quantification category to its corresponding 33 | QuantificationCategory enum member. 34 | 35 | Parameters: 36 | name (str): The name of the quantification category. 37 | 38 | Returns: 39 | QuantificationCategory: The corresponding enum member. 40 | 41 | Raises: 42 | KeyError: If the provided name does not match any quantification category. 43 | """ 44 | name_ = name.lower() 45 | for k, v in cls._member_map_.items(): 46 | if k.lower() == name_: 47 | return v 48 | raise KeyError(name) 49 | 50 | @classmethod 51 | def classify( 52 | cls, labels: Union[list[str], set[str]] 53 | ) -> tuple["Optional[QuantificationCategory]", "Optional[IsobaricLabel]"]: 54 | """ 55 | Classifies a set of labels into a quantification category and determines the isobaric label scheme. 56 | 57 | Parameters: 58 | labels (Union[list[str], set[str]]): A collection of label strings to classify. 59 | 60 | Returns: 61 | tuple[Optional[QuantificationCategory], Optional[IsobaricLabel]]: 62 | A tuple containing the quantification category and the isobaric label scheme, if applicable. 63 | 64 | Raises: 65 | ValueError: If the labels do not correspond to a known quantification category. 
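
        Example (illustrative):
            >>> cat, scheme = QuantificationCategory.classify(
            ...     {"TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"}
            ... )
            >>> cat is QuantificationCategory.TMT and scheme is IsobaricLabel.TMT6plex
            True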
66 | """ 67 | label_scheme = None 68 | 69 | if len(labels) == 1 and any("label free" in s.lower() for s in labels): 70 | label_category = cls.LFQ 71 | 72 | elif any("tmt" in s.lower() for s in labels): 73 | label_category = cls.TMT 74 | if ( 75 | len(labels) > 11 76 | or "TMT134N" in labels 77 | or "TMT133C" in labels 78 | or "TMT133N" in labels 79 | or "TMT132C" in labels 80 | or "TMT132N" in labels 81 | ): 82 | label_scheme = IsobaricLabel.TMT16plex 83 | elif len(labels) == 11 or "TMT131C" in labels: 84 | label_scheme = IsobaricLabel.TMT11plex 85 | elif len(labels) > 6: 86 | label_scheme = IsobaricLabel.TMT10plex 87 | else: 88 | label_scheme = IsobaricLabel.TMT6plex 89 | 90 | elif any("itraq" in s.lower() for s in labels): 91 | label_category = cls.ITRAQ 92 | if len(labels) > 4: 93 | label_scheme = IsobaricLabel.ITRAQ8plex 94 | else: 95 | label_scheme = IsobaricLabel.ITRAQ4plex 96 | 97 | else: 98 | raise ValueError( 99 | f"Cannot infer labeling scheme from {labels}, only support label free, TMT and ITRAQ experiment!" 100 | ) 101 | return label_category, label_scheme 102 | 103 | 104 | class IsobaricLabel(Enum): 105 | """ 106 | An enumeration for different isobaric labeling schemes used in proteomics. 107 | 108 | Attributes: 109 | TMT6plex: Represents the TMT 6-plex labeling scheme. 110 | TMT10plex: Represents the TMT 10-plex labeling scheme. 111 | TMT11plex: Represents the TMT 11-plex labeling scheme. 112 | TMT16plex: Represents the TMT 16-plex labeling scheme. 113 | ITRAQ4plex: Represents the ITRAQ 4-plex labeling scheme. 114 | ITRAQ8plex: Represents the ITRAQ 8-plex labeling scheme. 115 | 116 | Methods: 117 | from_str(name: str) -> IsobaricLabel: 118 | Converts a string to an IsobaricLabel enum member. 119 | channels() -> IsobaricLabelSpec: 120 | Retrieves the channel specifications for the isobaric label. 121 | """ 122 | 123 | TMT6plex = auto() 124 | TMT10plex = auto() 125 | TMT11plex = auto() 126 | TMT16plex = auto() 127 | 128 | ITRAQ4plex = auto() 129 | ITRAQ8plex = auto() 130 | 131 | @classmethod 132 | def from_str(cls, name: str) -> "IsobaricLabel": 133 | """ 134 | Converts a string representation of a quantification category to its corresponding 135 | QuantificationCategory enum member. 136 | 137 | Parameters: 138 | name (str): The name of the quantification category. 139 | 140 | Returns: 141 | QuantificationCategory: The corresponding enum member. 142 | 143 | Raises: 144 | KeyError: If the provided name does not match any quantification category. 145 | """ 146 | name_ = name.lower() 147 | for k, v in cls._member_map_.items(): 148 | if k.lower() == name_: 149 | return v 150 | raise KeyError(name) 151 | 152 | def channels(self) -> "IsobaricLabelSpec": 153 | """ 154 | Retrieves the channel specifications associated with the isobaric label. 155 | 156 | Returns: 157 | IsobaricLabelSpec: The channel specifications for the current isobaric label. 158 | """ 159 | return IsobaricLabelSpec.registry[self.name] 160 | 161 | 162 | @dataclass 163 | class IsobaricLabelSpec(Mapping[str, int]): 164 | """ 165 | A data class representing the specifications of isobaric labels, including their 166 | name and channel mappings. This class supports dictionary-like access to channel 167 | information and maintains a registry of all instances. 168 | 169 | Attributes: 170 | registry (ClassVar[dict[str, IsobaricLabelSpec]]): A class-level registry of all 171 | isobaric label specifications. 172 | name (str): The name of the isobaric label. 
173 |         channels (dict[str, int]): A mapping of channel names to their respective indices.
174 | 
175 |     Methods:
176 |         __post_init__(): Registers the instance in the class-level registry.
177 |         id: Retrieves the corresponding IsobaricLabel enum member for the label name.
178 |         __getitem__(key: str) -> int: Returns the index of the specified channel.
179 |         __iter__() -> Iterator[str]: Iterates over the channel names.
180 |         __len__() -> int: Returns the number of channels.
181 |         __contains__(key) -> bool: Checks if a channel name exists in the channels.
182 |     """
183 | 
184 |     registry: ClassVar[dict[str, "IsobaricLabelSpec"]] = {}
185 | 
186 |     name: str
187 |     channels: dict[str, int] = field(default_factory=dict)
188 | 
189 |     def __post_init__(self):
190 |         self.registry[self.name] = self
191 | 
192 |     @property
193 |     def id(self):
194 |         try:
195 |             return IsobaricLabel[self.name]
196 |         except KeyError:
197 |             return None
198 | 
199 |     def __getitem__(self, key: str) -> int:
200 |         return self.channels[key]
201 | 
202 |     def __iter__(self) -> Iterator[str]:
203 |         yield from self.channels
204 | 
205 |     def __len__(self) -> int:
206 |         return len(self.channels)
207 | 
208 |     def __contains__(self, key) -> bool:
209 |         return key in self.channels
210 | 
211 | 
212 | TMT16plex = IsobaricLabelSpec(
213 |     "TMT16plex",
214 |     {
215 |         "TMT126": 1,
216 |         "TMT127N": 2,
217 |         "TMT127C": 3,
218 |         "TMT128N": 4,
219 |         "TMT128C": 5,
220 |         "TMT129N": 6,
221 |         "TMT129C": 7,
222 |         "TMT130N": 8,
223 |         "TMT130C": 9,
224 |         "TMT131N": 10,
225 |         "TMT131C": 11,
226 |         "TMT132N": 12,
227 |         "TMT132C": 13,
228 |         "TMT133N": 14,
229 |         "TMT133C": 15,
230 |         "TMT134N": 16,
231 |     },
232 | )
233 | 
234 | TMT11plex = IsobaricLabelSpec(
235 |     "TMT11plex",
236 |     {
237 |         "TMT126": 1,
238 |         "TMT127N": 2,
239 |         "TMT127C": 3,
240 |         "TMT128N": 4,
241 |         "TMT128C": 5,
242 |         "TMT129N": 6,
243 |         "TMT129C": 7,
244 |         "TMT130N": 8,
245 |         "TMT130C": 9,
246 |         "TMT131N": 10,
247 |         "TMT131C": 11,
248 |     },
249 | )
250 | 
251 | TMT10plex = IsobaricLabelSpec(
252 |     "TMT10plex",
253 |     {
254 |         "TMT126": 1,
255 |         "TMT127N": 2,
256 |         "TMT127C": 3,
257 |         "TMT128N": 4,
258 |         "TMT128C": 5,
259 |         "TMT129N": 6,
260 |         "TMT129C": 7,
261 |         "TMT130N": 8,
262 |         "TMT130C": 9,
263 |         "TMT131": 10,
264 |     },
265 | )
266 | 
267 | TMT6plex = IsobaricLabelSpec(
268 |     "TMT6plex",
269 |     {
270 |         "TMT126": 1,
271 |         "TMT127": 2,
272 |         "TMT128": 3,
273 |         "TMT129": 4,
274 |         "TMT130": 5,
275 |         "TMT131": 6,
276 |     },
277 | )
278 | 
279 | ITRAQ4plex = IsobaricLabelSpec(
280 |     "ITRAQ4plex", {"ITRAQ114": 1, "ITRAQ115": 2, "ITRAQ116": 3, "ITRAQ117": 4}
281 | )
282 | 
283 | ITRAQ8plex = IsobaricLabelSpec(
284 |     "ITRAQ8plex",
285 |     {
286 |         "ITRAQ113": 1,
287 |         "ITRAQ114": 2,
288 |         "ITRAQ115": 3,
289 |         "ITRAQ116": 4,
290 |         "ITRAQ117": 5,
291 |         "ITRAQ118": 6,
292 |         "ITRAQ119": 7,
293 |         "ITRAQ121": 8,
294 |     },
295 | )
296 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "ibaqpy"
3 | description = "Python package to compute intensity-based absolute expression values"
4 | readme = "README.md"
5 | license = "MIT"
6 | version = "0.0.5"
7 | authors = [
8 |     "Yasset Perez-Riverol ",
9 |     "Dai Chengxin ",
10 |     "Julianus Pfeuffer ",
11 |     "Joshua Klein ",
12 |     "Enrique Audain ",
13 |     "Ping Zheng "
14 | ]
15 | keywords = [
16 |     "quantms",
17 |     "proteomics",
18 |     "mass-spectrometry",
19 |     "data-analysis",
20 |     "big data"
21 | ]
22 | classifiers = [
23 |     "Intended 
Audience :: Science/Research", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python :: 3 :: Only", 27 | "Topic :: Scientific/Engineering :: Bio-Informatics", 28 | "Development Status :: 5 - Production/Stable" 29 | ] 30 | packages = [ 31 | { include = "ibaqpy" } 32 | ] 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.9" 36 | scikit-learn = "*" 37 | pyopenms = "*" 38 | numpy = "<2.1.0" 39 | click = "*" 40 | pandas = "*" 41 | matplotlib = "*" 42 | pyarrow = ">=16.1.0" 43 | duckdb = ">=0.10.1" 44 | qnorm = "*" 45 | scipy = ">=1.10" 46 | seaborn = ">=0.13.2" 47 | typing_extensions = ">=4.6.3" 48 | inmoose = "*" 49 | 50 | [tool.poetry.urls] 51 | GitHub = "https://github.com/bigbio/ibaqpy/" 52 | PyPi = "https://pypi.org/project/ibaqpy/" 53 | Quantms = "https://quantms.org" 54 | LICENSE = "https://github.com/bigbio/ibaqpy/blob/main/LICENSE" 55 | 56 | [tool.poetry.scripts] 57 | ibaqpyc = "ibaqpy.ibaqpyc:main" 58 | 59 | [tool.isort] 60 | profile = "black" 61 | 62 | [tool.black] 63 | line-length = 99 64 | target-version = ["py39"] 65 | 66 | [build-system] 67 | requires = ["poetry-core>=1.2.0"] 68 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /qodana.yaml: -------------------------------------------------------------------------------- 1 | version: "1.0" 2 | linter: jetbrains/qodana-jvm:2024.2 3 | profile: 4 | name: qodana.recommended 5 | include: 6 | - name: CheckDependencyLicenses 7 | -------------------------------------------------------------------------------- /recipe/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.11 3 | -------------------------------------------------------------------------------- /recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | # recipe/meta.yaml 2 | package: 3 | name: ibaqpy 4 | version: "0.0.5" 5 | 6 | source: 7 | path: ../ 8 | 9 | build: 10 | entry_points: 11 | - ibaqpyc=ibaqpy.ibaqpyc:main 12 | run_exports: 13 | - {{ pin_subpackage('ibaqpy', max_pin="x.x") }} 14 | script: "{{ PYTHON }} -m pip install . 
--no-deps --no-build-isolation --no-cache-dir -vvv"
15 |   number: 0
16 |   noarch: python
17 | 
18 | requirements:
19 |   host:
20 |     - python
21 |     - pip
22 |     - poetry-core >=1.2.0
23 |   run:
24 |     - python>=3.9
25 |     - scikit-learn
26 |     - pyopenms
27 |     - numpy<2.1.0
28 |     - click
29 |     - pandas
30 |     - matplotlib
31 |     - pyarrow>=16.1.0
32 |     - duckdb>=0.10.1
33 |     - qnorm
34 |     - scipy>=1.10
35 |     - seaborn>=0.13.2
36 |     - typing_extensions>=4.6.3
37 |     - inmoose
38 | test:
39 |   imports:
40 |     - ibaqpy
41 |   commands:
42 |     - ibaqpyc --help
43 | 
44 | about:
45 |   home: https://www.github.com/bigbio/ibaqpy
46 |   summary: Python package to compute intensity-based absolute expression values
47 |   license: MIT
48 |   license_file: LICENSE
49 |   dev_url: https://www.github.com/bigbio/ibaqpy
50 | 
51 | extra:
52 |   recipe-maintainers:
53 |     - ypriverol
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn~=1.6.1
2 | pyopenms
3 | numpy<2.1.0
4 | click~=8.1.3
5 | pandas~=2.0.1
6 | matplotlib~=3.7.1
7 | pyarrow>=16.1.0
8 | duckdb>=0.10.1
9 | qnorm
10 | scipy>=1.10
11 | seaborn>=0.13.2
12 | typing_extensions>=4.6.3
13 | inmoose
14 | pytest~=8.3.4
15 | anndata~=0.10.9
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/tests/__init__.py
--------------------------------------------------------------------------------
/tests/example/feature.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbio/ibaqpy/8d33657b81ae3c8bfc9dbea3c3617ad6088181cf/tests/example/feature.parquet
--------------------------------------------------------------------------------
/tests/example/out/.gitignore:
--------------------------------------------------------------------------------
1 | PXD*
2 | *.pdf
--------------------------------------------------------------------------------
/tests/test_batch_correction.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | import pytest
4 | import anndata
5 | 
6 | import pandas as pd
7 | 
8 | from ibaqpy.commands.correct_batches import run_batch_correction
9 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID, PROTEIN_NAME, IBAQ, IBAQ_BEC
10 | 
11 | TESTS_DIR = Path(__file__).parent
12 | 
13 | logger = logging.getLogger(__name__)
14 | logger.addHandler(logging.NullHandler())
15 | 
16 | 
17 | def test_correct_batches():
18 |     """
19 |     Test the `run_batch_correction` function to ensure it correctly processes iBAQ values
20 |     from TSV files, generates the expected output files, and handles various error cases.
21 | 
22 |     This test verifies:
23 |     - The creation and non-emptiness of the corrected output TSV file.
24 |     - The creation and correct shape of the AnnData object.
25 |     - Handling of invalid sample IDs by raising a ValueError.
26 |     - Handling of missing required columns by raising a ValueError.
27 |     - Handling of invalid file patterns by raising a ValueError.
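
    Note (an inference from the fixtures below, not a documented contract):
    sample ids are expected to encode their batch of origin, which is why
    the malformed ids under tests/invalid-samples trigger a ValueError.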
28 | """ 29 | args = { 30 | "folder": TESTS_DIR / "ibaq-raw-hela", 31 | "pattern": "*ibaq.tsv", 32 | "comment": "#", 33 | "sep": "\t", 34 | "output": TESTS_DIR / "example/ibaq_corrected_combined.tsv", 35 | "sample_id_column": SAMPLE_ID, 36 | "protein_id_column": PROTEIN_NAME, 37 | "ibaq_raw_column": IBAQ, 38 | "ibaq_corrected_column": IBAQ_BEC, 39 | "export_anndata": True, 40 | } 41 | logging.debug("Arguments for run_batch_correction: %s", args) 42 | run_batch_correction(**args) 43 | 44 | # Assert the output file is created and not empty 45 | output_path = Path(args["output"]) 46 | assert output_path.exists(), f"Expected output file {output_path} was not created." 47 | df = pd.read_csv(output_path, sep=args["sep"]) 48 | assert not df.empty, "The corrected output file is empty." 49 | 50 | # Assert the AnnData object is created 51 | adata_path = output_path.with_suffix(".h5ad") 52 | assert adata_path.exists(), f"Expected AnnData file {adata_path} was not created." 53 | 54 | # Read the AnnData object and check shape and layers 55 | adata = anndata.read_h5ad(adata_path) 56 | logger.info(adata) 57 | assert adata.shape == (46, 3476) 58 | assert adata.layers[IBAQ_BEC].shape == (46, 3476) 59 | 60 | # Test invalid sample IDs 61 | with pytest.raises(ValueError): 62 | args["folder"] = TESTS_DIR / "invalid-samples" 63 | run_batch_correction(**args) 64 | 65 | # Test missing required columns 66 | with pytest.raises(ValueError): 67 | args["folder"] = TESTS_DIR / "ibaq-raw-hela" 68 | args["sample_id_column"] = "NonexistentColumn" 69 | run_batch_correction(**args) 70 | 71 | # Test invalid file pattern 72 | with pytest.raises(ValueError): 73 | args["pattern"] = "nonexistent*.tsv" 74 | run_batch_correction(**args) 75 | 76 | 77 | if __name__ == "__main__": 78 | test_correct_batches() 79 | -------------------------------------------------------------------------------- /tests/test_file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | 6 | from ibaqpy.ibaq.file_utils import create_anndata, combine_ibaq_tsv_files 7 | from ibaqpy.ibaq.ibaqpy_commons import ( 8 | SAMPLE_ID, 9 | PROTEIN_NAME, 10 | IBAQ, 11 | IBAQ_NORMALIZED, 12 | IBAQ_LOG, 13 | ) 14 | 15 | TESTS_DIR = Path(__file__).parent 16 | 17 | 18 | def test_combine_ibaq_tsv_files(): 19 | """ 20 | Test functions for combining iBAQ TSV files and creating AnnData objects. 21 | 22 | Functions: 23 | - test_combine_ibaq_tsv_files: Tests the combination of multiple iBAQ TSV files 24 | into a single DataFrame and verifies the shape of the resulting DataFrame. 25 | - test_create_anndata: Tests the creation of an AnnData object from a DataFrame 26 | with specified observation and variable columns, additional layers, and metadata. 27 | """ 28 | ibaq_dir = TESTS_DIR / "ibaq-raw-hela" 29 | files_pattern = "*ibaq.tsv" 30 | df_ibaq = combine_ibaq_tsv_files(dir_path=str(ibaq_dir), pattern=files_pattern, sep="\t") 31 | logging.info(df_ibaq.head()) 32 | assert df_ibaq.shape == (83725, 14) 33 | 34 | 35 | def test_create_anndata(): 36 | """ 37 | Test functions for combining iBAQ TSV files and creating AnnData objects. 38 | 39 | Functions: 40 | - test_combine_ibaq_tsv_files: Tests the combination of multiple iBAQ TSV files 41 | into a single DataFrame and verifies the shape of the resulting DataFrame. 
42 | 
43 |     "HeLa" is also expected among the Condition values.
44 |     """
45 |     df = pd.read_csv(TESTS_DIR / "ibaq-raw-hela/PXD000396.ibaq.tsv", sep="\t")
46 |     obs_col = SAMPLE_ID
47 |     var_col = PROTEIN_NAME
48 |     value_col = IBAQ
49 |     layers = [IBAQ_NORMALIZED, IBAQ_LOG]
50 |     adata = create_anndata(
51 |         df=df,
52 |         obs_col=obs_col,
53 |         var_col=var_col,
54 |         value_col=value_col,
55 |         layer_cols=layers,
56 |         obs_metadata_cols=["Condition"],
57 |         var_metadata_cols=[],
58 |     )
59 |     logging.info(adata)
60 |     assert adata.shape == (12, 3096)
61 |     assert adata.layers[IBAQ_NORMALIZED].shape == (12, 3096)
62 |     assert adata.layers[IBAQ_LOG].shape == (12, 3096)
63 |     assert "HeLa" in adata.obs["Condition"].values
--------------------------------------------------------------------------------
/tests/test_ibaqpy.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | from ibaqpy.ibaq.peptides2protein import peptides_to_protein
4 | 
5 | from pathlib import Path
6 | 
7 | TESTS_DIR = Path(__file__).parent
8 | 
9 | logger = logging.getLogger(__name__)
10 | logger.addHandler(logging.NullHandler())
11 | 
12 | 
13 | def test_ibaq_compute():
14 |     """
15 |     Test the computation of IBAQ values using the peptides_to_protein function.
16 | 
17 |     This test sets up the necessary arguments, including paths to input files,
18 |     enzyme type, normalization options, and output paths, and then calls the
19 |     peptides_to_protein function to perform the computation. It logs the
20 |     arguments for verification before execution.
21 | 
22 |     The test uses example data files located in the 'example' directory and
23 |     outputs results to the 'out' directory.
24 |     """
25 |     args = {
26 |         "fasta": str(
27 |             TESTS_DIR / "example/Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta"
28 |         ),
29 |         "peptides": str(TESTS_DIR / "example/PXD017834-peptides.csv"),
30 |         "enzyme": "Trypsin",
31 |         "normalize": True,
32 |         "min_aa": 7,
33 |         "max_aa": 30,
34 |         "tpa": True,
35 |         "ruler": True,
36 |         "ploidy": 2,
37 |         "cpc": 200,
38 |         "organism": "human",
39 |         "output": str(TESTS_DIR / "example" / "out" / "PXD017834-ibaq.tsv"),
40 |         "verbose": True,
41 |         "qc_report": str(TESTS_DIR / "example/out/QCprofile.pdf"),
42 |     }
43 |     logger.info(args)
44 |     peptides_to_protein(**args)
--------------------------------------------------------------------------------
/tests/test_ibaqpy_postprocessing.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pandas as pd
3 | 
4 | from ibaqpy.ibaq.ibaqpy_commons import SAMPLE_ID, IBAQ_NORMALIZED
5 | from ibaqpy.ibaq.ibaqpy_postprocessing import (
6 |     remove_samples_low_protein_number,
7 |     remove_missing_values,
8 |     describe_expression_metrics,
9 | )
10 | import logging
11 | 
12 | TESTS_DIR = Path(__file__).parent
13 | 
14 | 
15 | def test_remove_samples_low_protein_number():
16 |     """
17 |     Test removing samples with a low number of proteins.
18 | 
19 |     Reads the PXD017834 example iBAQ table and applies
20 |     remove_samples_low_protein_number with min_protein_num=286, keeping
21 |     only samples that quantify at least that many proteins.
22 | 
23 |     The number of samples before and after filtering is logged rather
24 |     than asserted, so the test mainly guards against regressions that
25 |     would make the function raise.
26 | """ 27 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 28 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 29 | number_samples = len(ibaq_df[SAMPLE_ID].unique()) 30 | logging.info("The number of samples in the dataframe {}".format(number_samples)) 31 | 32 | new_ibaq = remove_samples_low_protein_number(ibaq_df, min_protein_num=286) 33 | 34 | number_samples = len(new_ibaq[SAMPLE_ID].unique()) 35 | logging.info( 36 | "The number of samples with number of proteins higher than 286 is {}".format( 37 | number_samples 38 | ) 39 | ) 40 | 41 | 42 | def test_remove_missing_values(): 43 | """ 44 | Test functions for post-processing iBAQ data. 45 | 46 | These tests validate the functionality of the following operations: 47 | - Removing samples with a low number of proteins. 48 | - Removing samples with a high percentage of missing values. 49 | - Describing expression metrics across samples. 50 | 51 | Each test reads a sample iBAQ dataset, applies the respective function, 52 | and logs the number of samples before and after processing. 53 | """ 54 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 55 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 56 | number_samples = len(ibaq_df[SAMPLE_ID].unique()) 57 | logging.info("The number of samples in the dataframe {}".format(number_samples)) 58 | new_ibaq = remove_missing_values( 59 | ibaq_df, missingness_percentage=1, expression_column=IBAQ_NORMALIZED 60 | ) 61 | number_samples = len(new_ibaq[SAMPLE_ID].unique()) 62 | logging.info( 63 | "The number of samples with less than 1% of missing values is {}".format(number_samples) 64 | ) 65 | 66 | 67 | def test_describe_expression_metrics(): 68 | """ 69 | Test functions for post-processing iBAQ data. 70 | 71 | These tests validate the functionality of the following operations: 72 | - Removing samples with a low number of proteins. 73 | - Removing samples with a high percentage of missing values. 74 | - Describing expression metrics across samples. 75 | 76 | Each test reads a sample iBAQ dataset, applies the respective function, 77 | and logs the number of samples before and after processing. 78 | """ 79 | ibaq_test = TESTS_DIR / "example/PXD017834-example-ibaq.tsv" 80 | ibaq_df = pd.read_csv(ibaq_test, sep="\t") 81 | 82 | metrics = describe_expression_metrics(ibaq_df) 83 | logging.info(metrics) 84 | -------------------------------------------------------------------------------- /tests/test_peptide_normalize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ibaqpy.ibaq.peptide_normalization import peptide_normalization 4 | from pathlib import Path 5 | 6 | TESTS_DIR = Path(__file__).parent 7 | 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.NullHandler()) 10 | 11 | 12 | def test_feature_assembly(): 13 | """ 14 | Test the peptide normalization process by setting up arguments for the 15 | `peptide_normalization` function and executing it. This test checks the 16 | function's ability to process a feature table from a parquet file and an 17 | SDRF file, applying various filtering and normalization steps, and saving 18 | the output to a CSV file. It ensures that the output file is removed before 19 | the test to avoid conflicts. 20 | 21 | The test uses the following parameters: 22 | - parquet: Path to the input parquet file containing feature data. 23 | - sdrf: Path to the SDRF file for experimental metadata. 24 | - min_aa: Minimum number of amino acids required for peptides. 
25 | - min_unique: Minimum number of unique peptides required for proteins. 26 | - remove_ids: Path to a file with protein IDs to remove, if any. 27 | - remove_decoy_contaminants: Flag to remove decoy and contaminant proteins. 28 | - remove_low_frequency_peptides: Flag to remove low-frequency peptides. 29 | - output: Path to the output CSV file for normalized peptide intensities. 30 | - skip_normalization: Flag to skip the normalization process. 31 | - nmethod: Method for feature-level normalization. 32 | - pnmethod: Method for peptide-level normalization. 33 | - log2: Flag to apply log2 transformation to intensities. 34 | - save_parquet: Flag to save the output as a parquet file. 35 | """ 36 | 37 | args = { 38 | "parquet": str(TESTS_DIR / "example/feature.parquet"), 39 | "sdrf": str(TESTS_DIR / "example/PXD017834-TMT.sdrf.tsv"), 40 | "min_aa": 7, 41 | "min_unique": 2, 42 | "remove_ids": None, 43 | "remove_decoy_contaminants": True, 44 | "remove_low_frequency_peptides": True, 45 | "output": str(TESTS_DIR / "example" / "out" / "PXD017834-peptides-norm.csv"), 46 | "skip_normalization": False, 47 | "nmethod": "median", 48 | "pnmethod": "none", 49 | "log2": True, 50 | "save_parquet": True, 51 | } 52 | logger.info(args) 53 | out = Path(args["output"]) 54 | if out.exists(): 55 | out.unlink() 56 | peptide_normalization(**args) 57 | --------------------------------------------------------------------------------
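
For readers skimming the test suite above: the two pipeline stages exercised by
test_feature_assembly and test_ibaq_compute chain naturally. A compact
end-to-end sketch (illustrative only; file paths are placeholders, while the
argument names and values mirror the test calls in this repository):

from ibaqpy.ibaq.peptide_normalization import peptide_normalization
from ibaqpy.ibaq.peptides2protein import peptides_to_protein

# Step 1: assemble features into normalized peptide intensities, driven by a
# quantms feature table (parquet) and its SDRF experiment design.
peptide_normalization(
    parquet="feature.parquet",
    sdrf="experiment.sdrf.tsv",
    min_aa=7,
    min_unique=2,
    remove_ids=None,
    remove_decoy_contaminants=True,
    remove_low_frequency_peptides=True,
    output="peptides-norm.csv",
    skip_normalization=False,
    nmethod="median",
    pnmethod="none",
    log2=True,
    save_parquet=True,
)

# Step 2: roll the normalized peptides up to protein-level iBAQ values,
# with TPA and proteomic-ruler outputs enabled as in the tests.
peptides_to_protein(
    fasta="proteome.fasta",
    peptides="peptides-norm.csv",
    enzyme="Trypsin",
    normalize=True,
    min_aa=7,
    max_aa=30,
    tpa=True,
    ruler=True,
    ploidy=2,
    cpc=200,
    organism="human",
    output="ibaq.tsv",
    verbose=True,
    qc_report="QCprofile.pdf",
)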