├── .coveragerc
├── .flake8
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── conftest.py
├── docs
├── Makefile
├── _templates
│ └── autosummary
│ │ ├── base.rst
│ │ ├── class.rst
│ │ └── module.rst
├── api.rst
├── bokeh_plots.py
├── chem.rst
├── conf.py
├── data-curation.rst
├── descriptors.csv
├── examples
│ ├── DARTMS_MTBLS1198_SeaOmics__processing.ipynb
│ ├── DARTMS_processing.ipynb
│ ├── DARTMS_processing_ParameterOptimization.ipynb
│ ├── custom_peak_descriptors.py
│ ├── defined_spots_supervised.tsv
│ └── roi-creation.py
├── feature-correspondence.rst
├── fileio.rst
├── fileio_tutorial.rst
├── glossary.rst
├── index.rst
├── installation.rst
├── mzml.rst
├── peak-picking.rst
├── plots
│ ├── dbscan-clustering.py
│ ├── dbscan-parameters.py
│ ├── gmm-clustering.py
│ ├── peak-definition.py
│ ├── peak-detection-example.py
│ ├── peak_detection_baseline_example.py
│ └── roi-definition.py
├── preprocessing-steps.csv
├── processing_datasets.rst
├── quickstart.rst
├── requirements.txt
└── tutorials.rst
├── pyproject.toml
├── requirements.txt
├── src
└── tidyms
│ ├── __init__.py
│ ├── _batch_corrector.py
│ ├── _build_data_matrix.py
│ ├── _constants.py
│ ├── _filter_functions.py
│ ├── _mzml.py
│ ├── _plot_bokeh.py
│ ├── annotation
│ ├── __init__.py
│ ├── annotation.py
│ ├── annotation_data.py
│ ├── envelope_finder.py
│ └── mmi_finder.py
│ ├── assay.py
│ ├── chem
│ ├── __init__.py
│ ├── _envelope_utils.py
│ ├── _formula_generator.py
│ ├── atoms.py
│ ├── elements.json
│ ├── envelope_tools.py
│ ├── formula.py
│ ├── isotopes.json
│ └── utils.py
│ ├── consensus_annotation.py
│ ├── container.py
│ ├── correspondence.py
│ ├── dartms.py
│ ├── fileio.py
│ ├── fill_missing.py
│ ├── filter.py
│ ├── lcms.py
│ ├── peaks.py
│ ├── raw_data_utils.py
│ ├── simulation.py
│ ├── utils.py
│ └── validation.py
├── test_requirements.txt
├── tests
├── __init__.py
├── conftest.py
├── integration
│ ├── test_assay_real_data.py
│ └── test_real_raw_data.py
└── unit
│ ├── annotation
│ ├── test_annotation.py
│ ├── test_envelope_finder.py
│ └── test_mmi_finder.py
│ ├── test_assay.py
│ ├── test_batch_corrector.py
│ ├── test_build_data_matrix.py
│ ├── test_chem
│ ├── test_atoms.py
│ ├── test_formula.py
│ ├── test_formula_generator.py
│ ├── test_isotope_distributions.py
│ └── test_isotope_scorer.py
│ ├── test_consensus_annotation.py
│ ├── test_correspondence.py
│ ├── test_data_container.py
│ ├── test_fileio.py
│ ├── test_fill_missing.py
│ ├── test_filter.py
│ ├── test_lcms.py
│ ├── test_peaks.py
│ ├── test_raw_data_utils.py
│ ├── test_utils.py
│ └── test_validation.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | tidyms/validation.py
4 | tidyms/_plot_bokeh.py
5 |
6 | [report]
7 | exclude_lines =
8 | def plot
9 | pragma: no cover
10 | def __repr__
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203, E501
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # C generated files by Cython
10 | *.c
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 | docs/generated/
75 | docs/_static/*.html
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # celery beat schedule file
98 | celerybeat-schedule
99 |
100 | # SageMath parsed files
101 | *.sage.py
102 |
103 | # Environments
104 | .env
105 | .venv
106 | env/
107 | venv/
108 | ENV/
109 | env.bak/
110 | venv.bak/
111 |
112 | # Spyder project settings
113 | .spyderproject
114 | .spyproject
115 |
116 | # Rope project settings
117 | .ropeproject
118 |
119 | # mkdocs documentation
120 | /site
121 |
122 | # mypy
123 | .mypy_cache/
124 | .dmypy.json
125 | dmypy.json
126 |
127 | # Pyre type checker
128 | .pyre/
129 |
130 | # Pycharm
131 | .idea/
132 |
133 | # VS code
134 | .vscode/
135 | *.featureML
136 | *.dill
137 | docs/examples/exportedDataMatrix.tsv
138 | docs/examples/defined_spots_rtShifted.tsv
139 | docs/examples/defined_spots.tsv
140 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sphinx:
4 | configuration: docs/conf.py
5 |
6 | build:
7 | os: ubuntu-20.04
8 | tools:
9 | python: "3.9"
10 |
11 | python:
12 | install:
13 | - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2020, Bioanalytical mass spectrometry group at CIBION-CONICET
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include tidyms/chem/elements.json
4 | include tidyms/chem/isotopes.json
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # make file for pytest
2 |
3 | .PHONY: test-unit
4 | test-unit:
5 | pytest --cov=tidyms tests/unit
6 |
7 | .PHONY: test-all
8 | test-all:
9 | pytest --cov=tidyms
10 |
11 | .PHONY: coverage
12 | coverage:
13 | pytest --cov=tidyms && coverage html
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TidyMS: Tools for working with MS data in metabolomics
2 | ======================================================
3 |
4 | TidyMS is a python library for processing Mass Spectrometry data. It aims to
5 | provide easy to use tools to read, process and visualize MS data generated in
6 | metabolomic studies.
7 |
8 | Features
9 | --------
10 |
11 | TidyMS provides functionality to:
12 |
13 | 1. Read raw MS data in the mzML format
14 | 2. Spectrum and chromatogram creation.
15 | 3. Powerful and flexible peak picking functions optimized for chromatographic
16 | and spectral data.
17 | 4. Feature detection and feature correspondence in LC-MS data.
18 | 5. Reading processed data in a variety of formats (XCMS, MZMine2, ...)
19 | 6. Data matrix curation using widely accepted guidelines from the metabolomics
20 |    community.
21 | 7. Interactive visualizations of raw and processed data using Bokeh, or
22 | publication quality plots using seaborn.
23 |
24 | Installation
25 | ------------
26 |
27 | The latest release can be installed from PyPI:
28 |
29 | ```
30 | pip install tidyms
31 | ```
32 |
33 | Examples
34 | --------
35 |
36 | Jupyter notebooks with examples are available
37 | [here](https://github.com/griquelme/tidyms-notebooks).
38 |
39 | Tests
40 | -----
41 |
42 | TidyMS uses unit tests for most of its functionality.
43 | The tests can be executed with
44 | ```
45 | pytest
46 | ```
47 |
48 | Documentation
49 | -------------
50 |
51 | The official documentation is available at
52 | [readthedocs](https://tidyms.readthedocs.io/en/latest/).
53 |
54 |
55 | Citation
56 | --------
57 |
58 | If you find TidyMS useful, we would appreciate citations:
59 |
60 | Riquelme, G.; Zabalegui, N.; Marchi, P.; Jones, C.M.; Monge, M.E. A Python-Based
61 | Pipeline for Preprocessing LC–MS Data for Untargeted Metabolomics Workflows.
62 | _Metabolites_ **2020**, 10, 416, doi:10.3390/metabo10100416.
63 |
64 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/conftest.py
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_templates/autosummary/base.rst:
--------------------------------------------------------------------------------
1 | .. raw:: html
2 |
3 |
4 |
5 |
6 | {{ fullname | escape | underline}}
7 |
8 | .. currentmodule:: {{ module }}
9 |
10 | .. auto{{ objtype }}:: {{ objname }}
11 |
12 |
--------------------------------------------------------------------------------
/docs/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | .. raw:: html
2 |
3 |
4 |
5 |
6 | {{ fullname | escape | underline}}
7 |
8 | .. currentmodule:: {{ module }}
9 |
10 |
11 | .. autoclass:: {{ name }}
12 | :members:
13 |
14 |
--------------------------------------------------------------------------------
/docs/_templates/autosummary/module.rst:
--------------------------------------------------------------------------------
1 | {{ fullname }}
2 | {{ underline }}
3 |
4 | .. automodule:: {{fullname}}
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | API reference
6 | =============
7 |
8 | Tools for working with raw data
9 | -------------------------------
10 |
11 | .. autosummary::
12 | :toctree: generated
13 |
14 | tidyms.Assay
15 | tidyms.MSData
16 | tidyms.Chromatogram
17 | tidyms.MSSpectrum
18 |
19 | Tools for working with processed data
20 | -------------------------------------
21 |
22 | .. autosummary::
23 | :toctree: generated
24 |
25 | tidyms.DataContainer
26 | tidyms.filter.Pipeline
27 |
28 | List of available filters and processors
29 | ----------------------------------------
30 |
31 | .. autosummary::
32 | :toctree: generated
33 |
34 | tidyms.filter.BatchCorrector
35 | tidyms.filter.BlankCorrector
36 | tidyms.filter.ClassRemover
37 | tidyms.filter.DilutionFilter
38 | tidyms.filter.DRatioFilter
39 | tidyms.filter.PrevalenceFilter
40 | tidyms.filter.VariationFilter
41 |
42 | Tools for working with chemical data
43 | ------------------------------------
44 |
45 | .. autosummary::
46 | :toctree: generated
47 |
48 | tidyms.chem.Formula
49 | tidyms.chem.PeriodicTable
50 | tidyms.chem.FormulaGenerator
51 | tidyms.chem.EnvelopeScorer
52 |
53 | Module reference
54 | ----------------
55 |
56 | .. autosummary::
57 | :toctree: generated
58 |
59 | tidyms.container
60 | tidyms.correspondence
61 | tidyms.fileio
62 | tidyms.filter
63 | tidyms.lcms
64 | tidyms.peaks
65 | tidyms.raw_data_utils
66 | tidyms.utils
67 | tidyms.chem.atoms
68 | tidyms.chem.envelope_tools
69 | tidyms.chem.formula
70 |
71 | tidyms.dartms
--------------------------------------------------------------------------------
/docs/bokeh_plots.py:
--------------------------------------------------------------------------------
1 | from bokeh import plotting
2 | import tidyms as ms
3 | import numpy as np
4 | from pathlib import Path
5 |
6 |
7 | seed = 1234
8 |
9 |
def create_chromatogram() -> ms.Chromatogram:
    """Download one NIST raw-data sample and build a single chromatogram.

    The mzML file is fetched into the local TidyMS data directory and an
    extracted-ion chromatogram for m/z 189.0734 is returned.
    """
    dataset = "test-nist-raw-data"
    filename = "NZ_20200227_039.mzML"
    ms.fileio.download_tidyms_data(dataset, [filename])
    sample_path = Path(ms.fileio.get_tidyms_path()).joinpath(dataset, filename)

    ms_data = ms.MSData.create_MSData_instance(
        sample_path,
        ms_mode="centroid",
        instrument="qtof",
        separation="uplc",
    )
    chromatograms = ms.make_chromatograms(ms_data, np.array([189.0734]))
    return chromatograms[0]
26 |
27 |
def plot_chromatogram():
    """Render the example chromatogram and save it as a static HTML file."""
    plotting.output_file("_static/chromatogram.html")
    figure = create_chromatogram().plot(show=False)
    plotting.save(figure)
33 |
34 |
def plot_chromatogram_with_peaks():
    """Detect peaks on the example chromatogram and save the annotated plot."""
    # the module-level seed keeps the generated plot identical between builds
    plotting.output_file("_static/chromatogram-with-peaks.html")
    chrom = create_chromatogram()
    chrom.extract_features()
    plotting.save(chrom.plot(show=False))
42 |
43 |
def feature_plot():
    """Save a feature plot of the tryptophan [M+H]+ ion to _static."""
    plotting.output_file("_static/feature-plot.html")
    dataset = ms.fileio.load_dataset("reference-materials")
    excluded_classes = ["Z", "SV", "B", "SSS", "SCQC"]
    # [M+H]+ from tryptophan: list features compatible with this m/z and rt
    matching_features = dataset.select_features(205.097, 124)
    figure = dataset.plot.feature(
        matching_features[0], draw=False, ignore_classes=excluded_classes
    )
    plotting.save(figure)
56 |
57 |
def pca_plot():
    """Save a PCA score plot of the reference-materials dataset to _static."""
    plotting.output_file("_static/pca-scores.html")
    dataset = ms.fileio.load_dataset("reference-materials")
    excluded_classes = ["Z", "SV", "B", "SSS", "SCQC"]
    figure = dataset.plot.pca_scores(
        fig_params={"height": 250},
        ignore_classes=excluded_classes,
        scaling="autoscaling",
        draw=False,
    )
    plotting.save(figure)
68 |
69 |
def create_assay(assay_path) -> ms.Assay:
    """Download the demo datasets and build an Assay over the NIST raw data.

    Parameters
    ----------
    assay_path :
        Directory where the assay processing results are stored.

    Returns
    -------
    ms.Assay

    Notes
    -----
    A stray ``plotting.output_file("_static/pca-scores.html")`` call (a
    copy-paste leftover from ``pca_plot``) was removed: redirecting bokeh
    output is unrelated to building the assay, and every plotting helper in
    this module sets its own output file before saving.
    """
    ms.fileio.download_dataset("test-nist-raw-data")
    ms.fileio.download_dataset("reference-materials")
    tidyms_dir = Path(ms.utils.get_tidyms_path())
    data_path = tidyms_dir.joinpath("test-nist-raw-data")
    sample_metadata_path = data_path.joinpath("sample_list.csv")

    assay = ms.Assay(
        data_path=data_path,
        assay_path=assay_path,
        sample_metadata=sample_metadata_path,
        separation="uplc",
        instrument="qtof"
    )
    return assay
86 |
87 |
def plot_roi_assay(assay: ms.Assay, save_path: str):
    """Save the ROI plot for one fixed sample of *assay* to *save_path*."""
    plotting.output_file(save_path)
    figure = assay.plot.roi("NZ_20200227_039", show=False)
    plotting.save(figure)
93 |
94 |
def plot_stacked_chromatogram(assay: ms.Assay):
    """Save a stacked-chromatogram plot for feature cluster 6 to _static."""
    plotting.output_file("_static/stacked-chromatograms.html")
    figure = assay.plot.stacked_chromatogram(6, show=False)
    plotting.save(figure)
99 |
100 |
def create_assay_plots():
    """Run the demo assay workflow and save all assay-related doc figures."""
    assay = create_assay("_build/test-assay")
    targeted_mz = np.array(
        [
            118.0654, 144.0810, 146.0605, 181.0720,
            188.0706, 189.0738, 195.0875, 205.0969,
        ]
    )
    # ROI detection restricted to a known list of m/z values
    assay.detect_features(
        verbose=False,
        tolerance=0.015,
        min_intensity=5000,
        targeted_mz=targeted_mz,
    )
    plot_roi_assay(assay, "_static/roi-no-peaks.html")
    # peak picking, feature table construction and feature correspondence
    assay.extract_features(store_smoothed=True, verbose=False)
    assay.describe_features(verbose=False)
    assay.build_feature_table()
    assay.match_features(verbose=False)
    plot_roi_assay(assay, "_static/roi-peaks.html")
    plot_stacked_chromatogram(assay)
121 |
122 |
def create_plots():
    """Generate every bokeh figure embedded in the documentation."""
    for make_plot in (
        plot_chromatogram,
        plot_chromatogram_with_peaks,
        feature_plot,
        pca_plot,
        create_assay_plots,
    ):
        make_plot()
--------------------------------------------------------------------------------
/docs/chem.rst:
--------------------------------------------------------------------------------
1 | .. _working-with-chemical-formulas:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | Chemical data utilities
6 | =======================
7 |
8 | The `chem` module contains utilities to work with chemical data such as isotopes,
9 | elements and formulas. Also, it contains utilities to generate formulas from
10 | exact mass, score isotopic envelopes and search isotopic envelope candidates
11 | from a list of m/z values.
12 |
13 | Searching chemical data
14 | -----------------------
15 |
16 | :func:`~tidyms.chem.PeriodicTable` contains element and isotope information.
17 | The ``get_element`` method returns a :class:`~tidyms.chem.atom.Element`
18 |
19 | .. code-block:: python
20 |
21 | >>> import tidyms as ms
22 | >>> ptable = ms.chem.PeriodicTable()
23 | >>> oxygen = ptable.get_element("O")
24 | >>> oxygen
25 | Element(O)
26 |
27 | Element information can be retrieved easily:
28 |
29 | .. code-block:: python
30 |
31 | >>> oxygen.z
32 | 8
33 | >>> oxygen.symbol
34 | "O"
35 | >>> oxygen.isotopes
36 | {16: Isotope(16O), 17: Isotope(17O), 18: Isotope(18O)}
37 | >>> oxygen.get_monoisotope()
38 | Isotope(16O)
39 | >>> oxygen.get_abundances()
40 | (array([16, 17, 18]),
41 | array([15.99491462, 16.9991317 , 17.999161 ]),
42 | array([9.9757e-01, 3.8000e-04, 2.0500e-03]))
43 |
44 | :class:`~tidyms.chem.atom.Isotope` store exact mass, nominal mass and abundance
45 | of each isotope:
46 |
47 | .. code-block:: python
48 |
49 | >>> o16 = oxygen.get_monoisotope()
50 | >>> o16.m
51 | 15.99491462
52 | >>> o16.a
53 | 16
54 | >>> o16.p
55 | 0.99757
56 |
57 | Working with chemical formulas
58 | ------------------------------
59 |
60 | Chemical formulas can be created with the :class:`~tidyms.chem.Formula` object:
61 |
62 | .. code-block:: python
63 |
64 | >>> water = ms.chem.Formula("H2O")
65 | >>> water
66 | Formula(H2O)
67 |
68 | Formula objects can be used to compute a formula mass and its isotopic envelope:
69 |
70 | .. code-block:: python
71 |
72 | >>> water.get_exact_mass()
73 | 18.010564684
74 | >>> M, p = water.get_isotopic_envelope()
75 | >>> M
76 | array([18.01056468, 19.01555724, 20.01481138, 21.02108788])
77 | >>> p
78 | array([9.97340572e-01, 6.09327319e-04, 2.04962911e-03, 4.71450803e-07])
79 |
80 | Formulas can be created by passing a dictionary of element or isotopes to a
81 | formula coefficient and the numerical charge of the formula. Formulas are
82 | implemented as dictionaries of isotopes to formula coefficients, so if an
83 | element is passed, it is assumed that it is the most abundant isotope.
84 |
85 | .. code-block:: python
86 |
87 | >>> f = ms.chem.Formula({"C": 1, "13C": 1, "O": 4}, 0)
88 | >>> f
89 | Formula(C(13C)O4)
90 |
91 | Isotopes can also be specified in the string format:
92 |
93 | .. code-block:: python
94 |
95 | >>> f = ms.chem.Formula("[C(13C)2H2O4]2-")
96 | Formula([C(13C)2H2O4]2-)
97 | >>> f.charge
98 | -2
99 |
100 |
101 | Sum formula generation
102 | ----------------------
103 |
104 | The :class:`~tidyms.chem.FormulaGenerator` generates sum formulas from a mass
105 | value. To generate formulas, the space of valid formulas must be defined by
106 | specifying coefficient bounds for each element, which are passed to the formula generator constructor:
107 |
108 | .. code-block:: python
109 |
110 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)}
111 | >>> formula_generator = ms.chem.FormulaGenerator(bounds)
112 |
113 | To generate formulas, an exact mass value must be passed, along with a tolerance
114 | to find compatible formulas.
115 |
116 | .. code-block:: python
117 |
118 | >>> f = ms.chem.Formula("C5H10O2")
119 | >>> M = f.get_exact_mass() # Mass value to generate formulas
120 | >>> tolerance = 0.005
121 | >>> formula_generator.generate_formulas(M, tolerance)
122 | >>> coefficients, isotopes, M_coeff = formula_generator.results_to_array()
123 | >>> coefficients
124 | array([[ 0, 10, 2, 4],
125 | [ 3, 8, 3, 1],
126 | [ 5, 10, 0, 2]])
127 | >>> isotopes
128 | [Isotope(12C), Isotope(1H), Isotope(14N), Isotope(16O)]
129 |
130 | Coefficients is a 2D Numpy array where each row are coefficients of valid
131 | formulas and each column is an isotope.
132 |
133 | Formula generator objects can be created easily by using the static method
134 | :meth:`~tidyms.chem.FormulaGenerator.from_hmdb`, which generates reasonable
135 | coefficients spaces for the CHNOPS elements by finding the maximum coefficients
136 | in compounds from the `Human Metabolome DataBase <https://hmdb.ca/>`_:
137 |
138 | .. code-block:: python
139 |
140 | m = 1000
141 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m)
142 |
143 | ``m`` defines the maximum mass of the compounds included to create the coefficient
144 | space. ``m`` can take values of 500, 1000, 1500 and 2000. Other element can be
145 | added as follows:
146 |
147 | .. code-block:: python
148 |
149 | m = 1000
150 | bounds = {"Cl": (0, 2)}
151 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m, bounds=bounds)
152 |
153 |
154 | Scoring Isotopic envelopes
155 | --------------------------
156 |
157 | Scoring measured envelopes against theoretical values is a common strategy
158 | to establish a formula candidate for an unknown compound. The
159 | :class:`~tidyms.chem.EnvelopeScorer` uses the formulas generated by a formula
160 | generator and scores them using a measure of similarity between the measured and
161 | theoretical envelopes:
162 |
163 | .. code-block:: python
164 |
165 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)}
166 | >>> fg = ms.chem.FormulaGenerator(bounds)
167 | >>> envelope_scorer = ms.chem.EnvelopeScorer(fg, scorer="qtof", max_length=10)
168 |
169 | The `max_length` parameter sets the maximum length of the measured envelopes to
170 | compare against theoretical values. The `scorer` parameter can be ``qtof``,
171 | ``orbitrap`` or a callable that implements a custom scorer. In the first two
172 | cases, default parameters are set for values measured in Q-TOF or Orbitrap
173 | instruments. The score method takes a list of exact mass and abundances of an
174 | envelope and scores against all compatible formulas. See the API for a detailed
175 | description on how to customize the scorer function. The results can be obtained
176 | with the :meth:`tidyms.chem.EnvelopeScorer.get_top_results` method:
177 |
178 | .. code-block:: python
179 |
180 | >>> import numpy as np
181 | >>> f = ms.chem.Formula("C5H10O2")
182 | >>> M, p = f.get_isotopic_envelope(4) # Get first four peaks from the envelope
183 | >>> tolerance = 0.005
184 | >>> envelope_scorer.score(M, p, tolerance)
185 | >>> coefficients, isotopes, score = envelope_scorer.get_top_results()
186 | >>> coefficients[np.argmax(score)]
187 | array([ 5, 10, 0, 2])
188 |
189 |
190 |
191 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# make the repository root and the docs directory importable so that both the
# tidyms package and the local bokeh_plots helper module can be found
sys.path.insert(0, os.path.abspath(os.path.pardir))
sys.path.insert(0, os.path.abspath(os.getcwd()))
from bokeh_plots import create_plots

# -- Project information -----------------------------------------------------

project = 'TidyMS'
copyright = "2020, Bioanalytical Mass Spectrometry Group at CIBION-CONICET"
author = 'Gabriel Riquelme'

# -- generate plot files -----------------------------------------------------
# the bokeh figures embedded in the docs are regenerated on every build; the
# output directories must exist before create_plots() writes into them
if not os.path.isdir("_static"):
    os.mkdir("_static")

if not os.path.isdir("_build"):
    os.mkdir("_build")

create_plots()

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'IPython.sphinxext.ipython_directive',
    'IPython.sphinxext.ipython_console_highlighting',
    'bokeh.sphinxext.bokeh_plot',
    'matplotlib.sphinxext.plot_directive',
    'numpydoc'
]

add_module_names = False
# Generate the API documentation when building
autosummary_generate = True
numpydoc_show_class_members = False

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

intersphinx_mapping = {
    'pandas': ('https://pandas.pydata.org/docs/', None),
    'scipy': ('https://docs.scipy.org/doc/scipy/', None),
}

# set index.rst as the master doc
master_doc = 'index'

# include __init__ in docs
autoclass_content = 'both'
--------------------------------------------------------------------------------
/docs/descriptors.csv:
--------------------------------------------------------------------------------
1 | Descriptor,Meaning
2 | height,height relative to the baseline
3 | area,area minus the baseline area
4 | rt,weighted average of the retention time in the peak region
5 | mz,weighted average of the m/z in the peak region
6 | width,"width, computed as the region where the 95 % of the peak area is distributed"
7 | snr,"peak signal-to-noise ratio, defined as the quotient between the peak height and the noise level"
8 | mz std,standard deviation of the m/z in the peak region
--------------------------------------------------------------------------------
/docs/examples/custom_peak_descriptors.py:
--------------------------------------------------------------------------------
"""Docs example: peak detection on a synthetic two-gaussian noisy signal."""
import numpy as np
from tidyms.peaks import detect_peaks
# NOTE(review): get_peak_descriptors is imported but not used in this snippet;
# presumably the surrounding documentation extends the example with it.
from tidyms.peaks import get_peak_descriptors
from tidyms.utils import gaussian_mixture

# always generate the same plot
np.random.seed(1234)

# create a signal with two gaussian peaks
# each row of gaussian_params parameterizes one gaussian component
x = np.arange(100)
gaussian_params = np.array([[25, 3, 30], [50, 2, 60]])
y = gaussian_mixture(x, gaussian_params).sum(axis=0)
# add a noise term
y += np.random.normal(size=y.size, scale=1)

# detect_peaks also returns the noise and baseline estimation used
peaks, noise, baseline = detect_peaks(y)
--------------------------------------------------------------------------------
/docs/examples/defined_spots_supervised.tsv:
--------------------------------------------------------------------------------
1 | msData_ID spotInd include name group class batch startRT_seconds endRT_seconds comment
2 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 0 False Spot_0 unknown unknown 1 10.15000016 308.5039902 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
3 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 1 True Airblank_1 Airblank unknown 1 326.7699909 367.3630142 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
4 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 2 True Airblank_2 Airblank unknown 1 376.4960003 419.1180038 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
5 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 3 True Airblank_3 Airblank unknown 1 428.2509899 469.8579884 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
6 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 4 True Airblank_4 Airblank unknown 1 478.9920044 521.6139793 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
7 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 5 True Airblank_5 Airblank unknown 1 529.7320175 572.3539925 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
8 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 6 True Airblank_6 Airblank unknown 1 581.4870071 623.0949783 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
9 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 7 True Airblank_7 Airblank unknown 1 632.227993 674.8500252 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
10 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 8 True Airblank_8 Airblank unknown 1 683.9829826 725.5899811 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
11 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 9 True Airblank_9 Airblank unknown 1 734.7229958 776.3310242 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
12 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 10 True Airblank_10 Airblank unknown 1 785.4639816 828.0860138 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
13 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 11 True Airblank_11 Airblank unknown 1 837.2190285 878.8260269 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
14 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 12 True Airblank_12 Airblank unknown 1 887.9600143 923.4780121 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
15 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 0 False Spot_0 Inj2ul unknown 2 37.54999995 229.3489981 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
16 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 1 True Inj10ul_1 Inj10ul unknown 2 247.6150131 288.2070065 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
17 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 2 True Inj10ul_2 Inj10ul unknown 2 297.3409939 339.9629974 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
18 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 3 True Inj10ul_3 Inj10ul unknown 2 348.081007 390.7030106 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
19 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 4 True Inj10ul_4 Inj10ul unknown 2 399.8359966 441.4439964 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
20 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 5 True Inj10ul_5 Inj10ul unknown 2 450.5770111 493.1989861 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
21 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 6 True Inj10ul_6 Inj10ul unknown 2 501.3170242 543.9389992 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
22 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 7 True Inj2ul_1 Inj2ul unknown 2 553.0729866 595.6950188 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
23 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 8 True Inj2ul_2 Inj2ul unknown 2 603.8129997 646.4349747 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
24 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 9 True Inj2ul_3 Inj2ul unknown 2 655.5690193 698.1909943 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
25 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 10 True Inj2ul_4 Inj2ul unknown 2 706.3089752 748.9310074 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
26 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 11 True Inj2ul_5 Inj2ul unknown 2 757.0500183 799.6719933 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
27 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 12 True Inj2ul_6 Inj2ul unknown 2 808.8050079 844.3230057 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000)
28 |
--------------------------------------------------------------------------------
/docs/examples/roi-creation.py:
--------------------------------------------------------------------------------
from ftplib import FTP
import tidyms as ms
import os

# download an example mzML file from the Metabolights repository via FTP
study_path = "pub/databases/metabolights/studies/public/MTBLS1919"
sample_path = os.path.join(study_path, "Applications/Centroid_data")
filename = "NZ_20200227_041.mzML"
ftp = FTP("ftp.ebi.ac.uk")
ftp.login()
ftp.cwd(sample_path)
with open(filename, "wb") as dst:
    ftp.retrbinary("RETR " + filename, dst.write)
ftp.close()

# passing the instrument and separation method used in the experiment
# selects better default parameter values in several processing functions
ms_data = ms.MSData.create_MSData_instance(
    filename, ms_mode="centroid", instrument="qtof", separation="uplc"
)
roi_list = ms_data.make_roi()
25 |
--------------------------------------------------------------------------------
/docs/fileio.rst:
--------------------------------------------------------------------------------
1 | .. _working-with-raw-data:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | :orphan:
6 |
7 | Working with raw data
8 | =====================
9 |
10 | TidyMS works with raw data in the mzML format using the :class:`~tidyms.MSData`
class. In this section we show common operations on raw data. For file
conversion to the mzML format, see :ref:`this guide <mzml>`.
13 |
14 | For the examples we will use an example mzML file that can be downloaded with
15 | the following code:
16 |
17 | .. code-block:: python
18 |
19 | import numpy as np
20 | import tidyms as ms
21 |
22 | filename = "NZ_20200227_039.mzML"
23 | dataset = "test-nist-raw-data"
24 | ms.fileio.download_tidyms_data(dataset, [filename], download_dir=".")
25 |
26 |
27 | Raw data
28 | --------
29 |
30 | Raw MS data in the mzML format can be read through the :class:`~tidyms.MSData`
31 | object.
32 |
33 | .. code-block:: python
34 |
35 | ms_data = ms.MSData.create_MSData_instance(
36 | filename,
37 | ms_mode="centroid",
38 | instrument="qtof",
39 | separation="uplc"
40 | )
41 |
42 | It is necessary to specify if the data is in centroid or profile mode using the
43 | :code:`ms_mode` parameter, as some methods work in different ways for each
44 | type of data. Specifying the :code:`instrument` and :code:`separation` is also
45 | recommended, as these parameters set reasonable defaults in different functions
46 | used.
47 |
48 | :class:`~tidyms.MSData` is optimized for low memory usage and only loads the
49 | required data into memory. A single MS spectrum can be loaded using
50 | :meth:`~tidyms.MSData.get_spectrum` which returns a
51 | :class:`~tidyms.lcms.MSSpectrum`.
52 |
53 | .. code-block:: python
54 |
55 | index = 20
56 | sp = ms_data.get_spectrum(index)
57 |
58 | The index used is the order in which the data was stored in the file. In the
59 | same way, a stored chromatogram can be retrieved using
60 | :meth:`~tidyms.MSData.get_chromatogram`. The total count of spectra and
61 | chromatograms in the file can be obtained using
62 | :meth:`tidyms.MSData.get_n_spectra` and
63 | :meth:`tidyms.MSData.get_n_chromatograms` respectively. Iterating over all
64 | the spectra in a file can be done using
65 | :meth:`~tidyms.MSData.get_spectra_iterator`, which generates each one of the
66 | spectra in the file and allows filtering by acquisition time or MS level.
67 | Common operations with raw data are located in :mod:`tidyms.raw_data_utils`.
68 |
69 |
70 | Working with Mass Spectra
71 | -------------------------
72 |
73 | :class:`~tidyms.MSSpectrum` stores the information from one scan. It is mostly
74 | used as a data storage class in several data processing steps, but it also has
75 | functionality to visualize the spectrum using the
76 | :meth:`~tidyms.MSSpectrum.plot` method and to convert a profile data spectrum
77 | into centroid mode using :meth:`tidyms.MSSpectrum.find_centroids`.
78 |
79 | :func:`tidyms.raw_data_utils.accumulate_spectra` combines a series of scans in
80 | a file into a single spectrum:
81 |
82 | .. code-block:: python
83 |
84 | combined_sp = ms.accumulate_spectra(ms_data, start_time=110, end_time=115)
85 |
86 | Chromatograms
87 | -------------
88 |
Besides the chromatograms stored in a file, extracted chromatograms can be
created using :func:`tidyms.raw_data_utils.make_chromatograms`, which takes an
array of m/z values and returns a list of :class:`tidyms.Chromatogram` objects,
each one associated with one of the m/z values provided:
93 |
94 | .. code-block:: python
95 |
96 | mz_list = np.array([189.0734, 205.0967, 188.071])
97 | chromatograms = ms.make_chromatograms(ms_data, mz_list)
98 |
A chromatogram can be visualized using the ``plot`` method:
100 |
101 | .. code-block:: python
102 |
103 | chrom = chromatograms[0]
104 | chrom.plot()
105 |
106 | .. raw:: html
107 |
108 |
109 |
110 | Peaks in a chromatogram are detected using
111 | :meth:`tidyms.lcms.LCRoi.extract_features`, which stores a list of
112 | :class:`tidyms.lcms.Peak` objects in the `features` attribute of the
113 | chromatogram. Plotting again the chromatogram shows the detected peaks:
114 |
115 | .. code-block:: python
116 |
117 | chrom.extract_features()
118 | chrom.plot()
119 |
120 | .. raw:: html
121 |
122 |
123 |
124 | Peak descriptors can be obtained using
125 | :meth:`tidyms.lcms.Roi.describe_features`:
126 |
127 | .. code-block:: python
128 |
129 | >>> chrom.describe_features()
130 | [{'height': 16572.38, 'area': 108529.94, 'rt': 125.73, 'width': 14.06,
131 | 'snr': 385.44, 'mz': None, 'mz_std': None}]
132 |
A detailed description of the algorithm used for peak picking can be found
:ref:`here <peak-picking>`. These methods are also used to create a data matrix
from a dataset. See :ref:`here <processing-datasets>` for a tutorial on how to
work with complete datasets to extract a data matrix.
137 |
--------------------------------------------------------------------------------
/docs/fileio_tutorial.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/docs/fileio_tutorial.rst
--------------------------------------------------------------------------------
/docs/glossary.rst:
--------------------------------------------------------------------------------
1 | .. definitions
2 |
3 | Definitions
4 | ===========
5 |
6 | Here is a list of the concepts used in TidyMS.
7 |
8 | .. glossary::
9 |
10 | batch correction
11 | A correction step applied to reduce the time dependent variation in the
12 | metabolite signals due to instrumental response changes, carryover,
13 | or metabolite degradation, among others.
14 |
15 | blank correction
16 | A correction applied on study samples to remove the contribution to
the signal coming from sample preparation. This process consists of
18 | measuring a set of blank samples and using them to estimate the
19 | sample preparation contribution to the signal.
20 |
21 | carryover
22 | A measurement artifact in LC-MS. Occurs when signals from one sample are
23 | detected in the next sample (signals are “carried over”).
24 |
25 | correction
26 | A data curation step where the data matrix is transformed to correct
27 | the data.
28 |
29 | data curation
30 | The process of reducing the bias introduced in the measurements during
31 | sample preparation and data acquisition. Also, the filtration of samples
32 | that cannot be measured in an analytically robust way.
33 |
34 | data matrix
35 | A matrix of feature values where each row is a sample or observation and
36 | each column is a feature.
37 |
38 | feature
39 | A measurable property of a phenomenon being observed. In LC-MS a feature
40 | is usually represented as a chromatographic peak.
41 |
42 | feature correspondence
The process of matching features extracted in different samples.
44 |
45 | feature descriptor
46 | A series of characteristics of a feature. In the case of a
47 | chromatographic peak, feature descriptors can be peak area, retention
time, or mean m/z, among others.
49 |
50 | feature detection
51 | The process of finding a feature in a data set. Once a feature is
52 | detected it can be extracted into a feature descriptor. In LC-MS the
53 | feature detection procedure involves the detection of chromatographic
54 | peaks and extraction into rt, m/z and area information.
55 |
56 | feature table
57 | The table obtained after feature extraction, where each row is a
58 | feature detected in a sample and each column is a descriptor.
59 |
60 | filtration
61 | A data curation step where samples or features are removed according
to specific criteria.
63 |
64 | mapping
A dictionary that maps the sample type to sample classes. The available
66 | sample types are: study sample, quality control, blank, system
67 | suitability.
68 |
69 | normalization
70 | An operation on the data matrix to adjust the sample values. Common
71 | normalization methods use different norms, such as the euclidean
72 | norm, Manhattan norm or maximum norm.
73 |
74 | prevalence filter
75 | A filter applied on a data matrix to remove features that are detected
76 | in a low number of samples.
77 |
78 | quality control sample
Samples used to demonstrate analytical accuracy, precision, and
repeatability; after data processing they can be converted into metrics
describing data quality.
82 |
83 | run order
84 | Temporal order in which the different samples were analyzed.
85 |
86 | sample class
87 | The category of the sample. Can be related to the study (e.g: healthy,
88 | disease) or to the experiment design (quality control, blank, etc...).
89 |
90 | sample descriptor
91 | A characteristic of a sample. Can be the sample type, class, run order,
92 | analytical batch.
93 |
94 | sample type
95 | The type of sample used in the experiment. Sample types can be: study
96 | sample, quality control, blank, system suitability.
97 |
98 | scaling
99 | An operation on the data matrix to change the distribution of features.
100 |
101 | system suitability check
102 | The analysis of a series of samples to assess the performance of an
103 | analytical platform.
104 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. TidyMS documentation master file, created by
2 | sphinx-quickstart on Tue May 19 15:53:07 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | TidyMS
7 | ======
8 |
9 | TidyMS is a python package that provides easy to use tools for processing and
10 | analyzing mass spectrometry based metabolomics data sets. It's built on top
11 | of Numpy, Pandas and scikit-learn. Get started by reading the
12 | :doc:`installation` instructions and then see an overview of the package in the
13 | :doc:`quickstart`. You can also see some applications in the example gallery. For
14 | detailed information about tidyms, you can see the :doc:`api` reference.
15 |
16 | .. toctree::
17 | :maxdepth: 2
18 | :caption: Contents:
19 |
20 | Glossary
21 | Installation guide
22 | Quickstart
23 | Tutorials
24 | API Reference
25 |
26 | Indices and tables
27 | ==================
28 |
29 | * :ref:`genindex`
30 | * :ref:`modindex`
31 | * :ref:`search`
32 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. installation
2 |
3 | Installation
4 | ============
5 |
6 | Python version
7 | --------------
8 |
We recommend using the latest version of Python 3. If you don't have Python
installed, we recommend installing it using the `Anaconda`_ distribution.
11 |
12 | .. _Anaconda: https://docs.anaconda.com/anaconda/install/
13 |
14 |
15 | Install TidyMS
16 | --------------
17 |
18 | If you already have Python, you can install TidyMS from the Python Package
19 | Index:
20 |
21 | On Linux:
22 |
23 | .. code-block:: sh
24 |
25 | $ pip install tidyms
26 |
27 | On Windows, if you are using Anaconda and didn't add Python to the PATH
28 | environment variable you have to run this command from the conda prompt.
29 |
30 |
--------------------------------------------------------------------------------
/docs/mzml.rst:
--------------------------------------------------------------------------------
1 | .. _mzml:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | Converting raw data to mzML format
6 | ==================================
7 |
8 | We recommend using `msconvert
9 | `_ to convert raw data
10 | generated from the different instruments to mzML format. Files can be converted
11 | from a GUI or from the command line. To convert all the files with names ending
12 | in :code:`.RAW` inside a directory from the command line the following command
13 | can be used:
14 |
15 | .. code-block:: bat
16 |
17 | msconvert *.RAW -o my_output_dir
18 |
19 | If you are using a Waters instrument with lockspray correction, the
20 | :code:`scanEvent` filter can be used to remove the signal from the lockspray.
21 |
22 | .. code-block:: bat
23 |
msconvert *.RAW --filter "scanEvent 1" -o my_output_dir
25 |
26 | To perform feature detection, data must be provided in centroid format. This
27 | can be done using the :code:`peakPicking` filter option:
28 |
29 | .. code-block:: bat
30 |
31 | msconvert data.RAW --filter "peakPicking cwt snr=1 peakSpace=0.01"
32 |
33 | A :code:`snr=1` is recommended as noisy peaks will be removed during feature
detection anyway. :code:`peakSpace` should be chosen according to the
35 | instrument used. For QTOF instruments a value of 0.01 is recommended, but
36 | for higher resolution instruments, such as orbitrap or FT-ICR, lower values
37 | may be used.
38 |
--------------------------------------------------------------------------------
/docs/plots/dbscan-clustering.py:
--------------------------------------------------------------------------------
import numpy as np
import tidyms as ms
import matplotlib.pyplot as plt

# fix the RNG seed so the example always generates the same plot
np.random.seed(1234)

# two overlapping 2D gaussian clusters with 200 points each
n = 200
X1 = np.random.normal(size=(n, 2))
X2 = np.random.normal(size=(n, 2), loc=(2, 2))
X = np.vstack((X1, X2))

# cluster the points with DBSCAN (eps=2.0, min_samples=50)
dbscan_labels = ms.correspondence._cluster_dbscan(X, 2.0, 50, 10000)

# plot each DBSCAN cluster with its own color; -1 labels noise points
fig, ax = plt.subplots()
for label in np.unique(dbscan_labels):
    ax.scatter(*X[dbscan_labels == label].T, label=label)

ax.set_xlabel("m/z")
ax.set_ylabel("Rt")
ax.legend(title="DBSCAN labels")
--------------------------------------------------------------------------------
/docs/plots/dbscan-parameters.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import seaborn as sns
from itertools import product

sns.set_context("paper", font_scale=1.25)

# fix the RNG seed so the docs figure is reproducible across builds
np.random.seed(1234)

# parameter grid: sample size, min_samples as a fraction of the sample
# size, and the DBSCAN eps radius; each combination is repeated n_reps times
sample_size = [10, 20, 50, 100, 200, 500]
fractions = [0.1, 0.25, 0.5, 0.75, 1.0]
eps = [0.5, 1.0, 2.0, 3.0, 4.0]
n_reps = 5
results = list()

# cluster 2D standard normal data and record the fraction of points that
# DBSCAN labels as noise (label == -1) for each parameter combination
for k_rep, size, f, e in product(range(n_reps), sample_size, fractions, eps):
    X = np.random.normal(size=(size, 2))
    min_samples = round(size * f)
    dbscan = DBSCAN(eps=e, min_samples=min_samples, metric="chebyshev")
    dbscan.fit(X)
    cluster = dbscan.labels_
    noise_fraction = (cluster == -1).sum() / size
    results.append([k_rep, size, f, e, noise_fraction])
df_normal = pd.DataFrame(
    data=results,
    columns=["rep", "sample size", "sample fraction", "eps", "noise fraction"]
)

sns.catplot(
    data=df_normal,
    x="eps",
    y="noise fraction",
    palette="Set1",
    col="sample size",
    hue="sample fraction",
    legend="full",
    col_wrap=2,
    s=8
)
--------------------------------------------------------------------------------
/docs/plots/gmm-clustering.py:
--------------------------------------------------------------------------------
import numpy as np
import tidyms as ms
import matplotlib.pyplot as plt

# fix the RNG seed so the example always generates the same plot
np.random.seed(1234)

# two overlapping 2D gaussian clusters with 200 points each; `samples`
# assigns a sample index to each row, as expected by _process_cluster
n = 200
X1 = np.random.normal(size=(n, 2))
samples = np.hstack((np.arange(n), np.arange(n)))
X2 = np.random.normal(size=(n, 2), loc=(2, 2))
X = np.vstack((X1, X2))

# split the points into two clusters with a gaussian mixture model
gmm_labels, score = ms.correspondence._process_cluster(X, samples, 2, 3.0)

# plot each GMM cluster with its own color
fig, ax = plt.subplots()
for label in np.unique(gmm_labels):
    ax.scatter(*X[gmm_labels == label].T, label=label)

ax.set_xlabel("m/z")
ax.set_ylabel("Rt")
ax.legend(title="GMM labels")
--------------------------------------------------------------------------------
/docs/plots/peak-definition.py:
--------------------------------------------------------------------------------
import tidyms as ms
import numpy as np
import matplotlib.pyplot as plt

# seed the RNG so the figure is always identical
np.random.seed(1234)

# noisy gaussian peak on a constant baseline of 3
grid = np.arange(50)
signal = ms.utils.gauss(grid, 25, 2, 30)
noise = np.random.normal(size=signal.size, scale=1)
x = signal + noise + 3
peak = ms.lcms.Peak(19, 25, 30)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(grid, x, label="signal")
# mark the three indices that define a peak: start, apex and end
for attr, marker in (("start", "peak start"), ("apex", "peak apex"),
                     ("end", "peak end")):
    idx = getattr(peak, attr)
    ax.scatter(grid[idx], x[idx], label=marker, s=50)
# shade the peak region; the end index is part of the peak
region = slice(peak.start, peak.end + 1)
ax.fill_between(grid[region], x[region], alpha=0.2, label="peak region")
# double-headed arrow illustrating the peak prominence
ax.annotate(text='', xy=(grid[peak.end + 5], x[peak.end]),
            xytext=(grid[peak.end + 5], x[peak.apex]),
            arrowprops=dict(arrowstyle='<->'))
ax.annotate(text='peak \n prominence',
            xy=(grid[peak.end + 10], x[peak.apex] / 2))
ax.legend()
26 |
--------------------------------------------------------------------------------
/docs/plots/peak-detection-example.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from tidyms import peaks
from tidyms.lcms import Peak
from tidyms.utils import gaussian_mixture

# always generate the same plot
np.random.seed(1234)

# create a signal with two gaussian peaks
x = np.arange(100)
gaussian_params = np.array([[25, 3, 30], [50, 2, 60]])
y = gaussian_mixture(x, gaussian_params).sum(axis=0)
# add a noise term
y += np.random.normal(size=y.size, scale=0.5)

# the noise level is estimated first because the baseline estimation uses it
noise_estimation = peaks.estimate_noise(y)
baseline_estimation = peaks.estimate_baseline(y, noise_estimation)
start, apex, end = peaks.detect_peaks(y, noise_estimation, baseline_estimation)
# use a name different from the imported `peaks` module to avoid shadowing it
peak_list = [Peak(s, a, p) for s, a, p in zip(start, apex, end)]
fig, ax = plt.subplots()
ax.plot(x, y)
for p in peak_list:
    # `end` is the last index of the peak, so include it in the shaded region
    ax.fill_between(x[p.start:p.end + 1], y[p.start:p.end + 1], alpha=0.25)
25 |
--------------------------------------------------------------------------------
/docs/plots/peak_detection_baseline_example.py:
--------------------------------------------------------------------------------
import tidyms as ms
import numpy as np
import matplotlib.pyplot as plt

# always generate the same plot
np.random.seed(1234)
signal_height = 100
snr = 10
n_col = 4
x = np.arange(200)
noise_level = signal_height / snr
noise = np.random.normal(size=x.size, scale=noise_level)
fig, ax = plt.subplots(
    nrows=3, ncols=n_col, figsize=(12, 12), sharex=True, sharey=True)

# candidate baselines: constant, gaussian bump, quadratic and sinusoidal
baselines = [4, ms.utils.gauss(x, 100, 40, 20), x ** 2 * 0.002,
             np.sin(x * np.pi / 400) * 50]


def _detect_and_plot(y, axes):
    """Estimate noise and baseline of ``y``, detect peaks and plot the result
    (signal, estimated baseline and shaded peak regions) on ``axes``."""
    noise_estimation = ms.peaks.estimate_noise(y)
    # smooth the signal before baseline estimation and peak picking
    ys = ms.lcms.gaussian_filter1d(y, 1)
    baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation)
    start, apex, end = ms.peaks.detect_peaks(
        ys, noise_estimation, baseline_estimation)
    detected = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)]
    axes.plot(x, y)
    axes.plot(x, baseline_estimation)
    for p in detected:
        axes.fill_between(x[p.start:p.end + 1],
                          baseline_estimation[p.start:p.end + 1],
                          y[p.start:p.end + 1], alpha=0.25)


# first row: one peak, different baselines
row = 0
for col in range(n_col):
    signal = ms.utils.gauss(x, 100, 3, signal_height)
    # fix: the baseline was defined but never added to the signal
    y = signal + baselines[col] + noise
    _detect_and_plot(y, ax[row, col])

# second row: two peaks, same baselines as first row
row = 1
for col in range(n_col):
    gaussian_params = np.array([[100, 3, signal_height],
                                [110, 3, signal_height]])
    signal = ms.utils.gaussian_mixture(x, gaussian_params).sum(axis=0)
    y = signal + baselines[col] + noise
    _detect_and_plot(y, ax[row, col])

# third row: different peak widths, constant baseline
row = 2
widths = [3, 5, 7, 10]
for col in range(n_col):
    signal = ms.utils.gauss(x, 100, widths[col], signal_height)
    y = signal + baselines[0] + noise
    _detect_and_plot(y, ax[row, col])
74 |
--------------------------------------------------------------------------------
/docs/plots/roi-definition.py:
--------------------------------------------------------------------------------
import tidyms as ms
import numpy as np
import matplotlib.pyplot as plt

# seed the RNG so the figure is always identical
np.random.seed(1234)

# noisy gaussian peak on a constant baseline of 3 (draw the intensity
# noise first, then the m/z noise, to keep the RNG stream order)
grid = np.arange(50)
signal = ms.utils.gauss(grid, 25, 2, 30)
x = signal + np.random.normal(size=signal.size, scale=1) + 3
# m/z trace: small gaussian fluctuations around a fixed mean value
mz_mean = 203.08215
mz = np.random.normal(size=signal.size, scale=0.0005) + mz_mean

fig, (ax_mz, ax_int) = plt.subplots(figsize=(6, 6), nrows=2, sharex=True)
ax_int.plot(grid, x)
ax_int.set_ylabel("Intensity")
ax_int.set_xlabel("Retention Time")
ax_mz.plot(grid, mz)
ax_mz.set_ylabel("m/z")
ax_mz.set_ylim(mz_mean - 0.0025, mz_mean + 0.0025)
21 |
--------------------------------------------------------------------------------
/docs/preprocessing-steps.csv:
--------------------------------------------------------------------------------
1 | #,name,description
2 | 1,Feature Detection,"Regions of interest (ROI) are detected in each sample."
3 | 2,Feature Extraction,"Features are extracted from each ROI."
4 | 3,Feature description,"A table of feature descriptors is built for each sample."
4,Feature table construction,"A feature table for all samples is built."
6 | 5,Feature matching,"Features found in different samples are grouped if they have a common identity."
7 | 6,Data matrix creation,"The data matrix is created using the feature table."
--------------------------------------------------------------------------------
/docs/quickstart.rst:
--------------------------------------------------------------------------------
1 | .. _quickstart:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | Quickstart
6 | ==========
7 |
8 | TidyMS [1]_ is a Python package that provides tools to process and analyze
9 | Mass Spectrometry (MS) data. Although suited for general use, it was designed
10 | to be used with datasets from LC-HRMS metabolomics experiments. It uses
`Numpy <https://numpy.org>`_, `Pandas <https://pandas.pydata.org>`_ and
`scikit-learn <https://scikit-learn.org>`_ for data processing and analysis.
Some of the functionality it offers is:
14 |
- Read raw data in the mzML format using the :class:`tidyms.MSData` class, optimized for speed and low memory usage.
16 | - Creation of chromatograms and accumulated spectra from raw data.
17 | - :term:`Feature detection` and :term:`feature correspondence` in metabolomics datasets using the :class:`tidyms.Assay` class.
18 | - Read processed data from other mass spectrometry processing software (XCMS, mzmine2, etc...).
19 | - A container object to manage metabolomics data.
20 | - :term:`Data curation` of untargeted metabolomics data sets using widely accepted practices from the metabolomics community [2]_ [3]_
- Interactive data visualization using `bokeh <https://bokeh.org>`_, or publication quality plots using `seaborn <https://seaborn.pydata.org>`_.
22 |
23 | In the rest of this guide, you can find links for different use cases for the
24 | TidyMS package. A basic knowledge of MS and metabolomics is assumed, but you can
25 | look up in the :doc:`glossary` the concepts used in the guides.
26 | Installation instructions are available :doc:`here`.
27 |
28 | You can refer to the following guides to learn about specific topics:
29 |
30 | - :ref:`Working with raw data `
31 | - :ref:`Processing complete datasets from raw data `
32 | - :ref:`Curation of a metabolomics data matrix `
33 | - :ref:`Feature detection and extraction algorithms `
34 | - :ref:`Feature correspondence algorithm `
35 | - :ref:`Converting proprietary instrument-specific formats into mzML `
36 |
37 |
38 | References
39 | ----------
40 |
41 | .. [1] Riquelme, G. *et al*, "A Python-Based Pipeline for Preprocessing LC–MS
42 | Data for Untargeted Metabolomics Workflows". Metabolites 2020, 10, 416.
43 | https://doi.org/10.3390/metabo10100416
44 | 16, 1, (2015), Pages 104–117, https://doi.org/10.1093/bib/bbt080
45 | .. [2] W B Dunn *et al*, "Procedures for large-scale metabolic profiling of
46 | serum and plasma using gas chromatography and liquid chromatography
47 | coupled to mass spectrometry", Nature Protocols volume 6, pages
48 | 1060–1083 (2011).
49 | .. [3] D Broadhurst *et al*, "Guidelines and considerations for the use of
50 | system suitability and quality control samples in mass spectrometry assays
51 | applied in untargeted clinical metabolomic studies.", Metabolomics,
52 | 2018;14(6):72. doi: 10.1007/s11306-018-1367-3
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | bokeh
2 | numpydoc
3 | sphinx
4 | tidyms
5 | sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/tutorials.rst:
--------------------------------------------------------------------------------
1 | .. _tutorials:
2 |
3 | .. py:currentmodule:: tidyms
4 |
5 | Tutorials
6 | =========
7 |
8 | In this section there is available a list of tutorials on specific topics.
9 |
10 | * :ref:`Converting files to mzML `
11 | * :ref:`Working with raw data `
12 | * :ref:`Processing complete datasets `
13 | * :ref:`Feature detection `
14 | * :ref:`Feature correspondence `
15 | * :ref:`Working with chemical formulas `
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "tidyms"
7 | version = "0.7.0"
8 | description = "Tools for working with MS data in metabolomics"
9 | authors = [
10 | { name = "Gabriel Riquelme" },
11 | ]
12 | readme = "README.md"
13 | license = {file = "LICENSE"}
14 | classifiers = [
15 | "Programming Language :: Python :: 3.9",
16 | "Programming Language :: Python :: 3.10",
17 | "License :: OSI Approved :: BSD License",
18 | "Topic :: Scientific/Engineering :: Bio-Informatics",
19 | "Topic :: Scientific/Engineering :: Chemistry",
20 | "Topic :: Scientific/Engineering :: Medical Science Apps."
21 | ]
22 | dependencies = [
23 | "beautifulsoup4>=4.11.2",
24 | "bokeh>=3.0",
25 | "Cerberus>=1.3",
26 | "dill>=0.3.6",
27 | "ipython>=8.1",
28 | "joblib>=1.1",
29 | "matplotlib>=3.5.1",
30 | "natsort>=8.2.0",
31 | "networkx>=3.0",
32 | "numpy>=1.22",
33 | "openpyxl>=3.0",
34 | "pandas>=1.5.3",
35 | "plotnine>=0.10.1",
36 | "requests",
37 | "scikit-learn>=1.0.2",
38 | "scipy>=1.8",
39 | "seaborn>=0.11",
40 | "statsmodels>=0.13",
41 | "tqdm>=4.0",
42 | "umap-learn>=0.5.3",
43 | "xlrd>=2.0"
44 | ]
45 | requires-python = ">=3.9"
46 |
47 | [project.urls]
48 | Homepage = "https://github.com/griquelme/tidyms"
49 |
50 | [tool.mypy]
51 | python_version = "3.9"
52 | files = ["src/tidyms"]
53 |
54 | [tool.pytest.ini_options]
55 | pythonpath = [
56 | ".", "./src"
57 | ]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bokeh>=3.0.3
2 | Cerberus>=1.3
3 | ipython>=8.1
4 | joblib>=1.1
5 | matplotlib>=3.5.1
6 | networkx>=3.0
7 | numpy>=1.22
8 | openpyxl>=3.0
9 | pandas>=1.5.3
10 | requests
11 | scikit-learn>=1.0.2
12 | scipy>=1.8
13 | seaborn>=0.12
14 | statsmodels>=0.13
15 | tqdm>=4.0
16 | xlrd>=2.0
17 |
18 | plotnine>=0.10.1
19 | natsort>=8.2.0
20 | beautifulsoup4>=4.11.2
21 | dill>=0.3.6
22 | umap-learn>=0.5.3
--------------------------------------------------------------------------------
/src/tidyms/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | TidyMS
3 | ======
4 |
5 | A package to work with Mass Spectrometry data from Metabolomics Experiments.
6 |
7 | Provides
8 | 1. The Assay object to process datasets from raw data.
9 | 2. The MSData object to work with raw data.
10 | 3. The DataContainer object to store metabolomics data sets.
11 | 4. Pipeline and Processor objects to perform curation of data sets.
12 |
13 | """
14 |
15 | from . import chem
16 | from . import fileio
17 | from . import container
18 | from . import utils
19 | from . import peaks
20 | from . import filter
21 | from . import lcms
22 | from . import simulation
23 | from . import raw_data_utils
24 | from . import _mzml
25 | from . import _build_data_matrix
26 | from . import correspondence
27 | from . import fill_missing
28 | from . import consensus_annotation
29 | from .container import DataContainer
30 | from .fileio import MSData
31 | from .lcms import Chromatogram, MSSpectrum
32 | from .assay import Assay
33 | from .raw_data_utils import *
34 | from . import dartms
35 | from .annotation import annotation
36 |
# Ensure the tidyms configuration directory exists and load user settings.
utils.create_tidyms_dir()
SETTINGS = utils.get_settings()

# Optionally apply the bokeh theme stored in the settings to the current
# bokeh document (imports are aliased with a leading underscore to keep
# them out of the public package namespace).
if SETTINGS["bokeh"]["apply_theme"]:
    from bokeh.themes import Theme as _Theme
    from bokeh.io import curdoc as _curdoc
    theme = SETTINGS["bokeh"]["theme"]
    _curdoc().theme = _Theme(json=theme)

# Enable inline bokeh output when running inside a Jupyter notebook.
if utils.is_notebook():
    from bokeh.plotting import output_notebook as _output_notebook

    _output_notebook()
50 |
--------------------------------------------------------------------------------
/src/tidyms/_constants.py:
--------------------------------------------------------------------------------
from typing import Final, List


# separation modes
HPLC: Final[str] = "hplc"
UPLC: Final[str] = "uplc"
DART: Final[str] = "None/DART"
LC_MODES: Final[List[str]] = [UPLC, HPLC, DART]
SEPARATION_MODES: Final[List[str]] = LC_MODES + []

# instruments
QTOF: Final[str] = "qtof"
ORBITRAP: Final[str] = "orbitrap"
MS_INSTRUMENTS: Final[List[str]] = [QTOF, ORBITRAP]

# MS mode
CENTROID: Final[str] = "centroid"
PROFILE: Final[str] = "profile"
MS_MODES: Final[List[str]] = [CENTROID, PROFILE]

# Data loading
MEMORY: Final[str] = "memory"
INFILE: Final[str] = "file"
SIMULATED: Final[str] = "simulated"
DATA_LOAD_MODES: Final[List[str]] = [MEMORY, INFILE, SIMULATED]
DEFAULT_DATA_LOAD_MODE: Final[str] = INFILE

# feature descriptors
FEATURE: Final[str] = "feature"
MZ: Final[str] = "mz"
RT_START: Final[str] = "rt start"
RT_END: Final[str] = "rt end"
RT: Final[str] = "rt"
RT_STD: Final[str] = "rt std"
AREA: Final[str] = "area"
WIDTH: Final[str] = "width"
HEIGHT: Final[str] = "height"
SNR: Final[str] = "snr"
MZ_STD: Final[str] = "mz_std"
ROI_INDEX: Final[str] = "roi_index"
FT_INDEX: Final[str] = "ft_index"
MERGED: Final[str] = "merged"

# chromatogram names
BASELINE: Final[str] = "baseline"
NOISE: Final[str] = "noise"
SPINT: Final[str] = "spint"  # spectral intensity
ROI_FEATURE_LIST: Final[str] = "features"
TIME: Final[str] = "time"
SCAN: Final[str] = "scan"
MODE: Final[str] = "mode"

# peak names
START: Final[str] = "start"
APEX: Final[str] = "apex"
END: Final[str] = "end"

# isotopologue envelope annotation
ENVELOPE_LABEL: Final[str] = "envelope_label"
ENVELOPE_INDEX: Final[str] = "envelope_index"
CHARGE: Final[str] = "charge"

# sample metadata
SAMPLE: Final[str] = "sample"
CLASS: Final[str] = "class"
ORDER: Final[str] = "order"
BATCH: Final[str] = "batch"
LABEL: Final[str] = "cluster"
ID: Final[str] = "id"
DILUTION: Final[str] = "dilution"
TYPE: Final[str] = "type"

# sample types
QC_TYPE: Final[str] = "qc"
DQC_TYPE: Final[str] = "dqc"
STUDY_TYPE: Final[str] = "sample"
BLANK_TYPE: Final[str] = "blank"
# NOTE: use typing.List here for consistency with the rest of the module
# (the original mixed `list[str]` and `List[str]`).
SAMPLE_TYPES: Final[List[str]] = [QC_TYPE, STUDY_TYPE, BLANK_TYPE, DQC_TYPE]


# assay file and dir names
ROI_DIR: Final[str] = "roi"
FT_DIR: Final[str] = "feature"
MANAGER_FILENAME: Final[str] = "metadata.pickle"
FT_TABLE_FILENAME: Final[str] = "feature-table.pickle"
DATA_MATRIX_FILENAME: Final[str] = "data-matrix.pickle"

# preprocessing steps
DETECT_FEATURES: Final[str] = "detect_features"
EXTRACT_FEATURES: Final[str] = "extract_features"
DESCRIBE_FEATURES: Final[str] = "describe_features"
ANNOTATE_ISOTOPOLOGUES: Final[str] = "annotate_isotopologues"
ANNOTATE_ADDUCTS: Final[str] = "annotate_adducts"
BUILD_FEATURE_TABLE: Final[str] = "build_feature_table"
MATCH_FEATURES: Final[str] = "match_features"
MAKE_DATA_MATRIX: Final[str] = "make_data_matrix"
FILL_MISSING: Final[str] = "fill_missing"

PREPROCESSING_STEPS: Final[List[str]] = [
    DETECT_FEATURES,
    EXTRACT_FEATURES,
    DESCRIBE_FEATURES,
    ANNOTATE_ISOTOPOLOGUES,
    ANNOTATE_ADDUCTS,
    BUILD_FEATURE_TABLE,
    MATCH_FEATURES,
    MAKE_DATA_MATRIX,
    FILL_MISSING,
]
110 |
--------------------------------------------------------------------------------
/src/tidyms/_plot_bokeh.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import bokeh.plotting
3 | from bokeh.palettes import all_palettes
4 | from bokeh.models import ColumnDataSource, Segment
5 | from .utils import get_settings
6 | from . import _constants as c
7 | from typing import Dict, Generator, List, Optional
8 |
9 |
def get_bokeh_settings():
    """Return the bokeh section of the user settings."""
    settings = get_settings()
    return settings["bokeh"]
12 |
13 |
def get_theme_params() -> dict:
    """Return the bokeh theme parameters from the settings."""
    bokeh_settings = get_bokeh_settings()
    return bokeh_settings["theme"]
16 |
17 |
def get_line_params() -> dict:
    """Return the default bokeh line parameters from the settings."""
    bokeh_settings = get_bokeh_settings()
    return bokeh_settings["line"]
20 |
21 |
def get_chromatogram_figure_params() -> dict:
    """Return figure parameters for chromatogram plots from the settings."""
    chromatogram_settings = get_bokeh_settings()["chromatogram"]
    return chromatogram_settings["figure"]
24 |
25 |
def get_spectrum_figure_params() -> dict:
    """Return figure parameters for spectrum plots from the settings."""
    spectrum_settings = get_bokeh_settings()["spectrum"]
    return spectrum_settings["figure"]
28 |
29 |
def get_varea_params() -> dict:
    """Return the default bokeh varea parameters from the settings."""
    bokeh_settings = get_bokeh_settings()
    return bokeh_settings["varea"]
32 |
33 |
def get_palette() -> List[str]:
    """Return the color palette configured in the settings."""
    return find_palette(**get_bokeh_settings()["palette"])
37 |
38 |
def make_figure(fig_params: Optional[dict] = None):
    """
    Create a bokeh figure.

    Parameters
    ----------
    fig_params : dict or None, default=None
        Keyword parameters passed to ``bokeh.plotting.figure``. ``None`` is
        equivalent to an empty dict.

    Returns
    -------
    bokeh Figure

    """
    # the parameter is optional in meaning, so give it a default value
    # instead of forcing callers to pass None explicitly
    if fig_params is None:
        fig_params = dict()
    return bokeh.plotting.figure(**fig_params)
43 |
44 |
def find_palette(name: str, size: Optional[int] = None) -> List[str]:
    """
    Look up a prebuilt bokeh palette by name.

    Parameters
    ----------
    name : str
        Name of a bokeh palette group (e.g. ``"Viridis"``).
    size : int or None, default=None
        Number of colors in the palette. If ``None``, the largest available
        size is used.

    Returns
    -------
    List[str]
        List of color strings.

    Raises
    ------
    ValueError
        If the palette name or the requested size does not exist.

    """
    try:
        # use the directly imported all_palettes mapping instead of the
        # module attribute path, which only worked incidentally
        palette_group = all_palettes[name]
        # by default get the palette with the largest size
        if size is None:
            size = max(palette_group)
        palette = palette_group[size]
    except KeyError:
        link = "https://docs.bokeh.org/en/latest/docs/reference/palettes.html"
        msg = "Palette not found. Refer to the list of prebuilt palettes at {}"
        # suppress the internal KeyError context; the message is self-contained
        raise ValueError(msg.format(link)) from None
    return palette
57 |
58 |
def palette_cycler(palette: List[str]) -> Generator[str, None, None]:
    """Yield colors from the palette in order, cycling endlessly."""
    n_colors = len(palette)
    position = 0
    while True:
        yield palette[position % n_colors]
        position += 1
65 |
66 |
def add_line(
    figure: bokeh.plotting.figure,
    x: np.ndarray,
    y: np.ndarray,
    line_params: Optional[dict] = None
):
    """
    Plots a line.

    Parameters
    ----------
    figure : bokeh.plotting.figure
        Figure where the line is drawn.
    x : array
    y : array
    line_params : dict
        key-value parameters passed to the bokeh line function; overrides
        the defaults from the settings.

    """
    # start from the configured defaults and let the caller override them
    params = get_line_params()
    if line_params:
        params.update(line_params)
    figure.line(x, y, **params)
91 |
92 |
def set_chromatogram_axis_params(fig: bokeh.plotting.figure):
    """Apply the configured chromatogram axis settings to a figure."""
    chromatogram_settings = get_bokeh_settings()["chromatogram"]
    fig.xaxis.update(**chromatogram_settings["xaxis"])
    fig.yaxis.update(**chromatogram_settings["yaxis"])
99 |
100 |
def set_ms_spectrum_axis_params(fig: bokeh.plotting.figure):
    """Apply the configured spectrum axis settings to a figure."""
    spectrum_settings = get_bokeh_settings()["spectrum"]
    fig.xaxis.update(**spectrum_settings["xaxis"])
    fig.yaxis.update(**spectrum_settings["yaxis"])
107 |
108 |
def fill_area(
    figure: bokeh.plotting.figure,
    x: np.ndarray,
    y: np.ndarray,
    start: int,
    end: int,
    color: str,
    **varea_params,
):
    """Shade the region between y and zero on the slice [start, end)."""
    # configured defaults first, caller overrides on top (updating with an
    # empty dict is a no-op, so no need to guard)
    params = get_varea_params()
    params.update(varea_params)

    figure.varea(x[start:end], y[start:end], 0, fill_color=color, **params)
126 |
127 |
def add_stems(
    fig: bokeh.plotting.figure,
    x: np.ndarray,
    y: np.ndarray,
    line_params: Optional[Dict] = None
):
    """Draw vertical stems from the x axis up to each (x, y) point."""
    params = get_line_params()
    if line_params:
        params.update(line_params)
    # each stem is a vertical segment from (x, 0) to (x, y)
    data = ColumnDataSource(dict(x0=x, x1=x, y0=np.zeros_like(y), y1=y))
    glyph = Segment(x0="x0", x1="x1", y0="y0", y1="y1", **params)
    fig.add_glyph(data, glyph)
143 |
144 |
class _LCAssayPlotter: # pragma: no cover
    """
    Methods to plot data from an Assay. Generates Bokeh Figures.

    Methods
    -------
    roi(sample: str) :
        m/z vs Rt view of the ROI and features in a sample.
    stacked_chromatogram(feature: int) :
        Overlapped chromatographic peaks for a feature in all samples

    """
    def __init__(self, assay):
        # assay providing feature tables, ROI lists and sample metadata
        self.assay = assay
        # sample x cluster lookup tables, built lazily on first use
        self.roi_index = None
        self.ft_index = None

    def _build_roi_index_table(self):
        """Build a sample x cluster table of ROI indices (-1 = not found)."""
        ft_table = self.assay.feature_table.copy()
        # cluster -1 marks noise/unmatched features; exclude them
        ft_table = ft_table[ft_table[c.LABEL] > -1]
        self.roi_index = (
            ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.ROI_INDEX)
            .fillna(-1)
            .astype(int)
        )

    def _build_peak_index_table(self):
        """Build a sample x cluster table of feature indices (-1 = not found)."""
        ft_table = self.assay.feature_table.copy()
        ft_table = ft_table[ft_table[c.LABEL] > -1]
        self.ft_index = (
            ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.FT_INDEX)
            .fillna(-1)
            .astype(int)
        )

    def roi(self, sample: str, show: bool = True) -> bokeh.plotting.figure:
        """
        Plots m/z vs time dispersion of the ROI in a sample. Detected features
        are highlighted using circles.

        Parameters
        ----------
        sample : str
            sample used in the Assay.
        show : bool, default=True
            If True calls ``bokeh.plotting.show`` on the Figure.

        Returns
        -------
        bokeh Figure
        """
        roi = self.assay.load_roi_list(sample)

        # hover tooltips show the feature descriptors at each circle
        TOOLTIPS = [
            ("m/z", "@{}".format(c.MZ)),
            ("Rt", "@{}".format(c.RT)),
            ("area", "@{}".format(c.AREA)),
            ("height", "@{}".format(c.HEIGHT)),
            ("width", "@{}".format(c.WIDTH)),
            ("SNR", "@{}".format(c.SNR)),
            ("roi index", "@{}".format(c.ROI_INDEX)),
            ("feature index", "@{}".format(c.FT_INDEX))
        ]
        fig = bokeh.plotting.figure(tooltips=TOOLTIPS)

        # one line per ROI: Rt vs m/z traces
        rt_list = list()
        mz_list = list()
        for r in roi:
            rt_list.append(r.time)
            mz_list.append(r.mz)
        line_source = bokeh.plotting.ColumnDataSource(
            dict(xs=rt_list, ys=mz_list)
        )
        line_params = get_line_params()
        fig.multi_line(xs="xs", ys="ys", source=line_source, **line_params)

        # overlay detected features; load_features raises ValueError when no
        # feature data is available for the sample, in which case only the
        # ROI traces are drawn
        try:
            ft = self.assay.load_features(sample)
            source = bokeh.plotting.ColumnDataSource(ft)
            fig.circle('rt', 'mz', size=5, source=source)
        except ValueError:
            pass
        fig.xaxis.update(axis_label="Rt [s]")
        fig.yaxis.update(axis_label="m/z")
        if show:
            bokeh.plotting.show(fig)
        return fig

    def stacked_chromatogram(
        self,
        cluster: int,
        include_classes: Optional[List[str]] = None,
        show: bool = True
    ) -> bokeh.plotting.figure:
        """
        Plots chromatograms of a feature detected across different samples.

        Parameters
        ----------
        cluster : int
            cluster value obtained from feature correspondence.
        include_classes : List[str] or None, default=None
            List of classes to plot. If None is used, samples from all classes
            are plotted.
        show : bool, default=True
            If True calls ``bokeh.plotting.show`` on the Figure.

        Returns
        -------
        bokeh Figure

        """
        if not self.assay.manager.check_step("match_features"):
            msg = "This plot only can be generated after feature matching"
            raise ValueError(msg)
        else:
            if self.ft_index is None:
                self._build_peak_index_table()

            if self.roi_index is None:
                self._build_roi_index_table()

        fig_params = get_chromatogram_figure_params()
        fig = bokeh.plotting.figure(**fig_params)
        roi_index = self.roi_index[cluster].to_numpy()
        ft_index = self.ft_index[cluster].to_numpy()
        samples = self.roi_index.index
        # TODO: fix after refactoring DataContainers
        classes = self.assay.get_sample_metadata()["class"]
        palette = get_palette()
        # map each included class to a fixed color from the palette
        if include_classes is not None:
            class_to_color = dict()
            for k, cl in enumerate(include_classes):
                class_to_color[cl] = palette[k]

        # NOTE: the loop variables below shadow the roi_index/ft_index arrays;
        # this is safe because the iterator was created beforehand
        iterator = zip(samples, roi_index, ft_index, classes)
        for sample, roi_index, ft_index, class_ in iterator:
            # draw only samples where the feature was found (-1 = missing)
            # and whose class is included
            check_draw = (
                (roi_index > -1) and
                ((include_classes is None) or (class_ in include_classes))
            )
            if check_draw:
                if include_classes is None:
                    color = palette[0]
                else:
                    color = class_to_color[class_]
                r = self.assay.load_roi(sample, roi_index)
                ft = r.features[ft_index]
                add_line(fig, r.time, r.spint)
                fill_area(
                    fig, r.time, r.spint, ft.start, ft.end, color, alpha=0.2)
        set_chromatogram_axis_params(fig)
        if show:
            bokeh.plotting.show(fig)
        return fig
300 |
--------------------------------------------------------------------------------
/src/tidyms/annotation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Annotation
3 | ----------
4 |
5 | Tools for feature annotation.
6 |
7 | Provides:
8 |
9 | 1. Tools for isotopologue annotation.
10 |
11 | Functions
12 | ---------
13 | annotate
14 | create_annotation_table
15 | create_annotation_tools
16 |
17 |
18 | """
19 |
20 | from .annotation import annotate, create_annotation_table, create_annotation_tools
21 |
22 | __all__ = ["annotate", "create_annotation_tools", "create_annotation_table"]
23 |
--------------------------------------------------------------------------------
/src/tidyms/annotation/annotation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from typing import Sequence
4 | from .annotation_data import AnnotationData
5 | from .envelope_finder import EnvelopeFinder
6 | from .mmi_finder import MMIFinder
7 | from ..lcms import Feature
8 | from ..chem import EnvelopeValidator
9 | from ..chem.atoms import EM, PeriodicTable
10 | from .. import _constants as c
11 |
12 |
def create_annotation_table(feature_list: list[Feature]) -> pd.DataFrame:
    """Build a table of isotopologue annotations, one row per feature."""
    column_order = [
        c.ROI_INDEX,
        c.FT_INDEX,
        c.ENVELOPE_INDEX,
        c.ENVELOPE_LABEL,
        c.CHARGE,
    ]
    rows = [
        (
            ft.roi.index,
            ft.index,
            ft.annotation.isotopologue_index,
            ft.annotation.isotopologue_label,
            ft.annotation.charge,
        )
        for ft in feature_list
    ]
    # pass columns explicitly so an empty feature list still yields a
    # DataFrame with the expected columns
    return pd.DataFrame(rows, columns=column_order)
31 |
32 |
def create_annotation_tools(
    bounds: dict[str, tuple[int, int]],
    max_mass: float,
    max_charge: int,
    max_length: int,
    min_M_tol: float,
    max_M_tol: float,
    p_tol: float,
    min_similarity: float,
    min_p: float,
) -> tuple[MMIFinder, EnvelopeFinder, EnvelopeValidator]:
    """
    Create the tools used for isotopologue annotation.

    Auxiliary function to :func:`annotate`.

    Parameters
    ----------
    bounds : Dict
        A dictionary of expected elements to minimum and maximum formula coefficients.
    max_mass : float
        Maximum exact mass of the features.
    max_charge : int
        Maximum charge of the features. Use negative values for negative polarity.
    max_length : int
        Maximum length of the envelopes.
    min_M_tol : float
        Minimum mass tolerance used during search. isotopologues with abundance
        equal to 1 use this value. Isotopologues with abundance equal to 0 use
        `max_M_tol`. For values in between, a weighted tolerance is used based
        on the abundance.
    max_M_tol : float
    p_tol : float
        Abundance tolerance.
    min_similarity : float
        Minimum cosine similarity between a pair of features
    min_p : float
        Minimum abundance of isotopes to include in candidate search.

    Returns
    -------
    tuple[MMIFinder, EnvelopeFinder, EnvelopeValidator]

    """
    # elements with a single stable isotope cannot contribute to an isotopic
    # envelope, so they are excluded from the search
    periodic_table = PeriodicTable()
    multi_isotope_bounds = {
        symbol: limits
        for symbol, limits in bounds.items()
        if len(periodic_table.get_element(symbol).isotopes) > 1
    }
    element_symbols = list(multi_isotope_bounds)

    bin_size = 100
    mmi_finder = MMIFinder(
        multi_isotope_bounds,
        max_mass,
        max_charge,
        max_length,
        bin_size,
        max_M_tol,
        p_tol,
        min_similarity,
    )
    envelope_finder = EnvelopeFinder(
        element_symbols, max_M_tol, max_length, min_p, min_similarity
    )
    envelope_validator = EnvelopeValidator(
        multi_isotope_bounds,
        max_M=max_mass,
        max_length=max_length,
        min_M_tol=min_M_tol,
        max_M_tol=max_M_tol,
        p_tol=p_tol,
    )
    return mmi_finder, envelope_finder, envelope_validator
101 |
102 |
def annotate(
    feature_list: list[Feature],
    mmi_finder: MMIFinder,
    envelope_finder: EnvelopeFinder,
    envelope_validator: EnvelopeValidator,
) -> None:
    """
    Annotate isotopologues in a sample.

    Annotations are added to the `annotation` attribute of each feature.

    Parameters
    ----------
    feature_list : List[LCTrace]
        List of features obtained after feature extraction.
    mmi_finder : MMIFinder
    envelope_finder : EnvelopeFinder
    envelope_validator : EnvelopeValidator

    """
    data = AnnotationData(feature_list)
    polarity = mmi_finder.polarity
    # repeatedly take the most intense non-annotated feature as the candidate
    # monoisotopologue until every feature has been processed
    mono = data.get_monoisotopologue()
    while mono is not None:
        candidates = mmi_finder.find(data)
        envelope, charge = find_best_envelope(
            data,
            mono,
            polarity,
            candidates,
            envelope_finder,
            envelope_validator,
        )
        data.annotate(envelope, charge)
        mono = data.get_monoisotopologue()
138 |
139 |
def find_best_envelope(
    data: AnnotationData,
    monoisotopologue: Feature,
    polarity: int,
    mmi_candidates: Sequence[tuple[Feature, int]],
    envelope_finder: EnvelopeFinder,
    envelope_validator: EnvelopeValidator,
) -> tuple[Sequence[Feature], int]:
    """Select the longest validated envelope among all MMI candidates."""
    # fall back to a singleton envelope with an invalid charge when no
    # candidate validates to a longer envelope
    best_envelope: Sequence[Feature] = [monoisotopologue]
    best_charge = -1
    best_length = 1
    for mmi, q in mmi_candidates:
        for candidate in envelope_finder.find(data, mmi, q):
            n_valid = _validate_candidate(
                candidate,
                monoisotopologue,
                q,
                polarity,
                best_length,
                envelope_validator,
            )
            if n_valid > best_length:
                best_length = n_valid
                best_envelope = candidate[:n_valid]
                best_charge = q
    return best_envelope, best_charge
167 |
168 |
def _validate_candidate(
    candidate: Sequence[Feature],
    monoisotopologue: Feature,
    charge: int,
    polarity: int,
    min_length: int,
    validator: EnvelopeValidator,
) -> int:
    """Return the validated envelope length, or 0 when the candidate fails."""
    # a useful candidate must be longer than the current best and must
    # contain the monoisotopologue being annotated
    if len(candidate) <= min_length or monoisotopologue not in candidate:
        return 0

    M, p = candidate[0].compute_isotopic_envelope(candidate)
    # convert m/z to neutral-equivalent mass: multiply by charge and remove
    # the electron mass contribution
    mass_array = np.array(M) * charge - EM * charge * polarity
    abundance_array = np.array(p)
    return validator.validate(mass_array, abundance_array)
188 |
--------------------------------------------------------------------------------
/src/tidyms/annotation/annotation_data.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from ..lcms import Feature
3 | from collections.abc import Sequence
4 |
5 |
class AnnotationData:
    """
    Feature data used during isotopologue annotation.

    Attributes
    ----------
    features : list[Feature]
        List of features sorted by m/z
    annotation : dict[Feature, int]
        Annotation of features
    similarity_cache : SimilarityCache
        Stores similarity between features.
    non_annotated : set[Feature]
        Non-annotated features.

    """

    def __init__(self, features: Sequence[Feature]):
        self.features = sorted(features)
        self.non_annotated = set(features)
        # sorted by increasing height: candidates are taken from the end
        self._monoisotopologues = sorted(features, key=lambda x: x.height)
        self.similarity_cache = SimilarityCache()
        self._label_counter = 0

    def get_monoisotopologue(self) -> Optional[Feature]:
        """Gets the current non-annotated feature with the greatest height."""
        # Discard stale (already annotated) entries from the tail. The
        # previous implementation kept looping after the list was exhausted
        # and popped from an empty list, raising IndexError.
        while self._monoisotopologues:
            mono = self._monoisotopologues[-1]
            if mono in self.non_annotated:
                return mono
            self._monoisotopologues.pop()
        return None

    def annotate(self, features: Sequence[Feature], charge: int):
        """Labels a list of features as an isotopic envelope."""
        if len(features) > 1:
            for k, ft in enumerate(features):
                ft.annotation.charge = charge
                ft.annotation.isotopologue_label = self._label_counter
                ft.annotation.isotopologue_index = k
                self._flag_annotated(ft)
            self._label_counter += 1
        else:
            # singleton envelopes keep their default annotation values
            self._flag_annotated(features[0])

    def _flag_annotated(self, feature: Feature):
        """Flag features as annotated."""
        self.non_annotated.discard(feature)
        # drop the feature from the candidate stack when it sits on top;
        # other positions are cleaned up lazily by get_monoisotopologue
        if self._monoisotopologues and (feature == self._monoisotopologues[-1]):
            self._monoisotopologues.pop()
61 |
62 |
class SimilarityCache:
    """Memoizes pairwise similarity values between features in a sample."""

    def __init__(self):
        # nested mapping: feature -> {other feature -> similarity}
        self._cache: dict[Feature, dict[Feature, float]] = dict()

    def get_similarity(self, ft1: Feature, ft2: Feature):
        """Return the similarity between two features, computing it at most once."""
        cache1 = self._cache.setdefault(ft1, dict())
        cache2 = self._cache.setdefault(ft2, dict())
        if ft2 not in cache1:
            value = ft1.compare(ft2)
            # store symmetrically so either lookup order hits the cache
            cache1[ft2] = value
            cache2[ft1] = value
        return cache1[ft2]
79 |
--------------------------------------------------------------------------------
/src/tidyms/annotation/envelope_finder.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions to find isotopic envelopes candidates in a list of m/z values.
3 |
4 | """
5 |
6 |
7 | import bisect
8 | from typing import Tuple
9 | from ..chem.atoms import Element, PeriodicTable
10 | from ..lcms import Feature
11 | from .annotation_data import AnnotationData, SimilarityCache
12 | from collections.abc import Sequence
13 |
14 | # name conventions
15 | # M is used for Molecular mass
16 | # m for nominal mass
17 | # p for abundances
18 |
19 |
class EnvelopeFinder(object):
    r"""
    Find isotopic envelopes candidates in a list of sorted m/z values.

    Attributes
    ----------
    tolerance : float
        tolerance used to extend the element based bounds
    max_length : int
        max length of the envelopes

    Notes
    -----
    Using a list of elements, theoretical bounds are computed for each M1, M2,
    M3, etc... isotopologue. Then using these values and the `mz_tolerance` and
    the `max_charge`, the bounds are adjusted according to the following
    equations:

    .. math::

        mz_{k, min}= \frac{m_{k, min}}{q} - mz_{tolerance}

        mz_{k, max}= \frac{m_{k, max}}{q} + mz_{tolerance}

    where :math:`m_{k, min}` is the minimum theoretical value for the k-th
    isotopologue and q is the charge.

    The envelopes candidates found are determined based on m/z compatibility
    only. To reduce the number of candidates, the list of m/z values should be
    reduced by other means, such as correlation of the values.

    """

    def __init__(
        self,
        elements: list[str],
        mz_tolerance: float,
        max_length: int = 5,
        min_p: float = 0.01,
        min_similarity: float = 0.9,
    ):
        """

        Parameters
        ----------
        elements : List[str]
            List of elements used to compute mass difference windows.
        mz_tolerance : float
            m/z tolerance used to match candidates.
        max_length : int, default=5
            Maximum envelope length to search.
        min_p : number between 0 and 1.
            The minimum abundance of the isotopes of each element to be used for m/z estimation.
        min_similarity : float, default=0.9
            Minimum similarity to create candidates.

        """

        el_list = [PeriodicTable().get_element(x) for x in elements]
        self.tolerance = mz_tolerance
        self.max_length = max_length
        self.min_similarity = min_similarity
        # per-isotopologue exact mass difference windows derived from the
        # element list
        self.bounds = _make_exact_mass_difference_bounds(el_list, min_p)

    def find(
        self,
        data: AnnotationData,
        mmi: Feature,
        charge: int,
    ) -> list[Sequence[Feature]]:
        """
        Finds isotopic envelope candidates starting from the minimum mass
        isotopologue (MMI).

        Parameters
        ----------
        data : AnnotationData
            Annotation data with the features sorted by m/z, the set of
            non-annotated features and the similarity cache.
        mmi : Feature
            Minimum Mass feature.
        charge : int
            Absolute value of the charge state of the isotopic envelope

        Returns
        -------
        envelopes: list[list[Feature]]
            List of isotopic envelope candidates.

        """
        envelopes = _find_envelopes(
            data.features,
            mmi,
            data.non_annotated,
            data.similarity_cache,
            charge,
            self.max_length,
            self.tolerance,
            self.min_similarity,
            self.bounds,
        )
        envelopes = _remove_sub_candidates(envelopes)
        return envelopes
124 |
125 |
126 | def _remove_sub_candidates(
127 | candidates: list[Sequence[Feature]],
128 | ) -> list[Sequence[Feature]]:
129 | """Remove candidates that are subsets of other candidates."""
130 | validated = list()
131 | while candidates:
132 | last = candidates.pop()
133 | last_set = set(last)
134 | is_subset = False
135 | for candidate in candidates:
136 | is_subset = last_set <= set(candidate)
137 | if not is_subset:
138 | validated.append(last)
139 | return validated
140 |
141 |
def _find_envelopes(
    features: Sequence[Feature],
    mmi: Feature,
    non_annotated: set[Feature],
    cache: SimilarityCache,
    charge: int,
    max_length: int,
    mz_tolerance: float,
    min_similarity: float,
    bounds: dict[int, Tuple[float, float]],
) -> list[Sequence[Feature]]:
    """
    Search isotopic envelope candidates starting at the MMI feature.

    Parameters
    ----------
    features : Sequence[Feature]
        Features sorted by m/z.
    mmi : Feature
        Candidate to minimum mass isotopologue.
    non_annotated : set[Feature]
        Features that have not been annotated yet.
    cache : SimilarityCache
        Cache of pairwise feature similarity values.
    charge : int
        Absolute value of the charge state of the isotopic envelope.
    max_length : int
        Maximum length of the envelope candidates.
    mz_tolerance : float
        Tolerance used to extend the theoretical m/z windows.
    min_similarity : float
        Minimum similarity with the MMI required to extend a candidate.
    bounds : dict
        Mass difference bounds created with
        ``_make_exact_mass_difference_bounds``.

    Returns
    -------
    list[Sequence[Feature]]
        Envelope candidates containing at least two features.

    """
    completed = list()
    stack = [[mmi]]
    while stack:
        candidate = stack.pop()

        # m/z window where the next isotopologue may be located
        min_mz, max_mz = _get_next_mz_search_interval(
            candidate, bounds, charge, mz_tolerance
        )
        first = bisect.bisect(features, min_mz)
        last = bisect.bisect(features, max_mz)

        # features in the window that are similar to the MMI and that have
        # not been annotated yet
        extensions = [
            ft
            for ft in features[first:last]
            if cache.get_similarity(mmi, ft) >= min_similarity and ft in non_annotated
        ]

        if extensions and (len(candidate) < max_length):
            stack.extend(candidate + [ft] for ft in extensions)
        else:
            completed.append(candidate)
    # a single-feature candidate is not an envelope
    return [x for x in completed if len(x) > 1]
211 |
212 |
def _get_next_mz_search_interval(
    envelope: Sequence[Feature],
    elements_mass_difference: dict[int, Tuple[float, float]],
    charge: int,
    mz_tolerance: float,
) -> Tuple[float, float]:
    """
    Compute the m/z window where the next isotopologue of an envelope may be
    located.

    Parameters
    ----------
    envelope : Sequence[Feature]
        Features of the current envelope candidate, sorted by m/z.
    elements_mass_difference : dict
        Mass difference bounds created with
        ``_make_exact_mass_difference_bounds``.
    charge : int
        Charge state. A value of 0 (neutral mass) gives the same result as
        charge 1, and the sign of the charge is ignored.
    mz_tolerance : float
        Tolerance added at both ends of the window.

    Returns
    -------
    min_mz, max_mz : Tuple[float, float]
        m/z window for the next isotopologue.

    """
    q = max(1, abs(charge))
    length = len(envelope)
    # start from an inverted (empty) window so that the result is empty when
    # no nominal mass increment is applicable
    lower = envelope[-1].mz + 2
    upper = envelope[-1].mz - 2
    for dm, (dM_min, dM_max) in elements_mass_difference.items():
        ref = length - dm
        if ref >= 0:
            lower = min(lower, envelope[ref].mz + dM_min / q)
            upper = max(upper, envelope[ref].mz + dM_max / q)
    return lower - mz_tolerance, upper + mz_tolerance
254 |
255 |
def _make_exact_mass_difference_bounds(
    elements: list[Element], min_p: float
) -> dict[int, Tuple[float, float]]:
    """
    Compute the exact mass differences obtained by exchanging one isotope.

    Parameters
    ----------
    elements : list[Element]
        Elements used to compute the mass differences.
    min_p : float
        Number between 0 and 1. Isotopes with an abundance lower than this
        value are ignored.

    Returns
    -------
    bounds : dict
        Mapping from nominal mass increments to a ``(min, max)`` tuple of
        exact mass increments, used to estimate valid m/z windows for
        isotopologues.

    """
    dM_by_dm: dict[int, list] = dict()
    for element in elements:
        nominal, exact, abundance = element.get_abundances()
        for i in range(1, exact.size):
            if abundance[i] > min_p:
                dm = nominal[i] - nominal[0]
                dM_by_dm.setdefault(dm, []).append(exact[i] - exact[0])
    return {dm: (min(dMs), max(dMs)) for dm, dMs in dM_by_dm.items()}
291 |
--------------------------------------------------------------------------------
/src/tidyms/annotation/mmi_finder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import bisect
3 | from typing import Optional
4 | from .annotation_data import AnnotationData
5 | from ..chem.atoms import Element, PeriodicTable, EM
6 | from ..chem._formula_generator import FormulaCoefficientBounds
7 | from ..chem.envelope_tools import make_formula_coefficients_envelopes
8 | from ..lcms import Feature
9 |
10 |
class MMIFinder:
    """
    Search Minimum Mass Isotopologue (MMI) candidates in a feature list.

    Candidates are obtained from the feature m/z and area values using a set
    of mass-dependent search rules.

    """

    def __init__(
        self,
        bounds: dict[str, tuple[int, int]],
        max_mass: float,
        max_charge: int,
        length: int,
        bin_size: int,
        mz_tol: float,
        p_tol: float,
        min_similarity: float,
        custom_abundances: Optional[dict[str, np.ndarray]] = None,
    ):
        """
        Constructor method.

        Parameters
        ----------
        bounds : dict
            Mapping from an element symbol str to the minimum and maximum
            allowed values in formulas.
        max_mass : float
            Maximum mass to build rules.
        max_charge : int
            Maximum charge to test. A negative value sets a negative
            polarity.
        length : int
            length of the theoretical envelopes used to compute the search
            rules.
        bin_size : int
            Mass interval used to build the rules.
        mz_tol : float
            m/z tolerance to search candidates.
        p_tol : float
            abundance tolerance used to search candidates.
        min_similarity : float, default=0.9
            Minimum similarity to create candidates.
        custom_abundances : dict, optional
            Provides custom elemental abundances. A mapping from element
            symbols str to an abundance array. The abundance array must have
            the same size that the natural abundance and its sum must be equal
            to one. For example, for "C", an alternative abundance can be
            array([0.15, 0.85]) for isotopes with nominal mass 12 and 13.

        """
        self.rules = _create_rules_dict(
            bounds, max_mass, length, bin_size, p_tol, custom_abundances
        )
        self.bin_size = bin_size
        self.max_charge = abs(max_charge)
        self.polarity = 1 if max_charge >= 0 else -1
        self.max_mass = max_mass
        self.mz_tol = mz_tol
        self.p_tol = p_tol
        self.min_similarity = min_similarity

    def find(self, data: AnnotationData) -> list[tuple[Feature, int]]:
        """
        Search MMI candidates for the monoisotopic feature of `data`.

        Parameters
        ----------
        data : AnnotationData
            Annotation data with m/z-sorted features.

        Returns
        -------
        list[tuple[Feature, int]]
            Pairs of (candidate feature, charge). The monoisotopic feature
            itself is included as a candidate for each valid charge state.

        """
        candidates = list()
        mono = data.get_monoisotopologue()
        if mono is None:
            return candidates

        for q in range(1, self.max_charge + 1):
            # electron-corrected neutral mass for this charge state
            M_mono = mono.mz * q - self.polarity * q * EM
            if M_mono >= self.max_mass:
                continue
            candidates.append((mono, q))
            mmi_rules = self.rules.get(int(M_mono // self.bin_size))
            if mmi_rules is None:
                continue
            for rule in mmi_rules:
                candidates.extend(
                    _find_candidate(
                        data,
                        mono,
                        q,
                        rule,
                        self.mz_tol,
                        self.p_tol,
                        self.max_mass,
                        self.min_similarity,
                    )
                )
        return candidates
115 |
116 |
def _find_candidate(
    data: AnnotationData,
    mono: Feature,
    charge: int,
    i_rules: dict,
    mz_tol: float,
    p_tol: float,
    max_mass: float,
    min_similarity: float,
) -> list[tuple[int, int]]:
    """
    Search MMI candidates for `mono` that match a single search rule.

    Returns a list of (feature, charge) pairs.

    """
    # m/z window, below the monoisotopic m/z, where the MMI may be located
    dM_min, dM_max = i_rules["dM"]
    mz_lo = mono.mz - dM_max / charge - mz_tol
    mz_hi = mono.mz - dM_min / charge + mz_tol

    # valid abundance quotient window
    qp_lo = i_rules["qp"][0] - p_tol
    qp_hi = i_rules["qp"][1] + p_tol

    if (mono.mz * charge) < max_mass:
        first = bisect.bisect(data.features, mz_lo)
        last = bisect.bisect(data.features, mz_hi)
    else:
        # mass above the maximum: skip the search with an empty range
        first = last = 0

    candidates = list()
    for k in range(first, last):
        ft = data.features[k]
        if _check_candidate(data, mono, ft, min_similarity, qp_lo, qp_hi):
            candidates.append((ft, charge))
    return candidates
149 |
150 |
def _check_candidate(
    data: AnnotationData,
    mono: Feature,
    candidate: Feature,
    min_similarity: float,
    min_qp: float,
    max_qp: float,
) -> bool:
    """
    Check if `candidate` is a plausible MMI for the monoisotopic feature.

    A candidate is valid when it has not been annotated yet, its similarity
    with the monoisotopic feature is at least `min_similarity`, and the
    abundance quotient of the pair falls inside ``[min_qp, max_qp]``.

    """
    if candidate not in data.non_annotated:
        return False

    if data.similarity_cache.get_similarity(mono, candidate) < min_similarity:
        return False

    # abundance quotient between the monoisotopic feature and the candidate
    _, p = mono.compute_isotopic_envelope([candidate, mono])
    qp = p[1] / p[0]
    return min_qp <= qp <= max_qp
173 |
174 |
def _create_rules_dict(
    bounds: dict[str, tuple[int, int]],
    max_mass: float,
    length: int,
    bin_size: int,
    p_tol: float,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> dict[int, list[dict[str, tuple[float, float]]]]:
    """
    Create the MMI search rules from theoretical isotopic envelopes.

    Parameters
    ----------
    bounds : dict
        Mapping from element symbols to (min, max) formula coefficients.
    max_mass : float
        Maximum mass used to generate the theoretical envelopes.
    length : int
        Length of the theoretical envelopes.
    bin_size : int
        Mass interval used to group formulas into bins.
    p_tol : float
        Abundance tolerance: a rule is created for an isotopologue index
        only when its abundance quotient with the MMI is >= 1 - p_tol.
    custom_abundances : dict or None
        Custom elemental abundances forwarded to the envelope generation.

    Returns
    -------
    rules : dict
        Mapping from a mass bin (``MMI mass // bin_size``) to a list of
        rules. Each rule maps "dM" and "qp" to (min, max) tuples: the exact
        mass difference and the abundance quotient between an isotopologue
        and the MMI.

    """
    Ma, pa = _create_envelope_arrays(bounds, max_mass, length, custom_abundances)
    # find the monoisotopic index, its Mass difference with the MMI (dM) and
    # its abundance quotient with the MMI (qp)
    bins = (Ma[:, 0] // bin_size).astype(int)

    # find unique values for bins and monoisotopic index that will be used
    # as key for the rule dictionary
    unique_bins = np.unique(bins)
    # unique_mono_index = np.unique(mono_index)
    # unique_mono_index = unique_mono_index[unique_mono_index > 0]

    rules = dict()
    for b in unique_bins:
        b_rules = list()
        bin_mask = bins == b
        # one candidate rule per isotopologue index mi; it is kept only when
        # some formula in the bin has an isotopologue at least as abundant
        # (within p_tol) as its MMI
        for mi in range(1, length):
            qp = pa[bin_mask, mi] / pa[bin_mask, 0]
            dM = Ma[bin_mask, mi] - Ma[bin_mask, 0]
            qp_mask = qp >= (1.0 - p_tol)
            if qp_mask.any():
                mi_rules = dict()
                dM_b_mi = dM[qp_mask]
                qp_b_mi = qp[qp_mask]
                mi_rules["dM"] = dM_b_mi.min(), dM_b_mi.max()
                mi_rules["qp"] = qp_b_mi.min(), qp_b_mi.max()
                b_rules.append(mi_rules)
        if b_rules:
            rules[b] = b_rules
    return rules
212 |
213 |
def _create_envelope_arrays(
    bounds: dict[str, tuple[int, int]],
    M_max: float,
    max_length: int,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> tuple[np.ndarray, np.ndarray]:
    """
    Build exact mass and abundance arrays for the envelopes of all formulas
    compatible with the coefficient bounds.

    Returns
    -------
    M : np.ndarray
        Exact masses, one row per formula.
    p : np.ndarray
        Abundances, one row per formula.

    """
    selected_elements = _select_elements(list(bounds), custom_abundances)
    mmi_isotopes = [e.get_mmi() for e in selected_elements]
    coefficient_bounds = FormulaCoefficientBounds(
        {iso: bounds[iso.get_symbol()] for iso in mmi_isotopes}
    )
    coefficients = coefficient_bounds.make_coefficients(M_max)
    envelope = make_formula_coefficients_envelopes(
        bounds, coefficients, max_length, custom_abundances
    )
    return envelope.M, envelope.p
230 |
231 |
def _select_two_isotope_element(
    e_list: list[str], dm: int, custom_abundances: dict[str, np.ndarray]
) -> list[str]:
    """
    Select two-isotope elements with a nominal mass difference equal to `dm`.

    Elements where the heavier isotope is more abundant than the lighter one
    are always selected. Among the remaining matching elements, only the one
    with the most abundant heavy isotope is kept.

    """
    selected = list()
    best_minor_symbol = None  # best element among those with p0 >= p1
    best_minor_p = 0
    for symbol in e_list:
        element = PeriodicTable().get_element(symbol)
        if len(element.isotopes) != 2:
            continue
        m, _, p = element.get_abundances()
        if (m[-1] - m[0]) != dm:
            continue
        p0, p1 = custom_abundances.get(symbol, p)
        if p1 > p0:
            selected.append(symbol)
        elif p1 > best_minor_p:
            best_minor_p = p1
            best_minor_symbol = symbol
    if best_minor_symbol is not None:
        selected.append(best_minor_symbol)
    return selected
254 |
255 |
def _select_multiple_isotope_elements(e_list: list[str]) -> list[str]:
    """Return the symbols in `e_list` with more than two stable isotopes."""
    return [s for s in e_list if len(PeriodicTable().get_element(s).isotopes) > 2]
264 |
265 |
def _select_elements(
    e_list: list[str], custom_abundances: Optional[dict[str, np.ndarray]] = None
) -> list[Element]:
    """
    Select the elements used to build the MMI search rules.

    Keeps all elements with more than two isotopes and, for two-isotope
    elements, the most relevant ones with nominal mass differences of 1 and
    2.

    """
    if custom_abundances is None:
        custom_abundances = dict()
    symbols = _select_multiple_isotope_elements(e_list)
    symbols.extend(_select_two_isotope_element(e_list, 1, custom_abundances))
    symbols.extend(_select_two_isotope_element(e_list, 2, custom_abundances))
    return [PeriodicTable().get_element(s) for s in symbols]
280 |
--------------------------------------------------------------------------------
/src/tidyms/chem/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Chemistry
3 | =========
4 |
5 | Provides:
6 |
7 | 1. A Formula object to compute the exact mass and isotopic distribution of molecular formulas.
8 | 2. A PeriodicTable with element and isotope information.
9 | 3. A formula generator object to search molecular formulas based on exact mass values.
10 | 4. An EnvelopeScorer that scores the similarity between experimental and theoretical isotopic envelopes.
11 |
12 | Objects
13 | -------
14 | - PeriodicTable
15 | - Formula
16 | - FormulaGenerator
17 | - EnvelopeScorer
18 |
19 | Constants
20 | ---------
21 | - EM : electron mass
22 |
23 | """
24 |
25 | from ._formula_generator import FormulaGenerator, get_chnops_bounds
26 | from .envelope_tools import EnvelopeScorer, EnvelopeValidator
27 | from .formula import Formula
28 | from .atoms import EM, PeriodicTable
29 |
--------------------------------------------------------------------------------
/src/tidyms/chem/_envelope_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Utilities to compute isotopic envelopes
4 |
5 | """
6 |
7 | import numpy as np
8 | from functools import cache
9 | from scipy.stats import multinomial
10 | from typing import Dict, Optional, Tuple
11 | from .atoms import Isotope, PeriodicTable
12 | from . import utils
13 |
14 |
def make_envelope_arrays(
    isotope: Isotope, n_min: int, n_max: int, max_length: int, p=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Create exact mass and abundance arrays for homonuclear formulas.

    Parameters
    ----------
    isotope : Isotope
        Isotope used to build the formulas.
    n_min : int
        Minimum formula coefficient.
    n_max : int
        Maximum formula coefficient.
    max_length : int
        Length of the envelope.
    p : array or None, default=None
        Element abundance. If None, the natural abundance is used.

    Returns
    -------
    M : (n_max - n_min + 1, max_length) array
        Isotopologue exact masses, one row per coefficient.
    p : (n_max - n_min + 1, max_length) array
        Isotopologue abundances, one row per coefficient.

    """
    n_rows = n_max - n_min + 1
    M_arr = np.zeros((n_rows, max_length))
    p_arr = np.zeros((n_rows, max_length))
    for row, n in enumerate(range(n_min, n_max + 1)):
        M_arr[row], p_arr[row] = _get_n_atoms_envelope(isotope, n, max_length, p=p)
    return M_arr, p_arr
50 |
51 |
def find_formula_envelope(
    composition: Dict[Isotope, int],
    max_length: int,
    p: Optional[Dict[str, np.ndarray]] = None,
    min_p: float = 1e-10,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute the isotopic envelope of a formula.

    Parameters
    ----------
    composition : Dict[Isotope, int]
        Mapping from isotopes to formula coefficients.
    max_length : int
        Length of the computed envelope.
    p : dict, optional
        Mapping from element symbols to custom abundance arrays.
    min_p : float, default=1e-10
        Envelope entries with an abundance lower than this value are
        removed.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        Exact mass and abundance of the envelope.

    """
    custom_p = dict() if p is None else p

    # start from an identity envelope and convolve each element in
    Mf = np.zeros((1, max_length), dtype=float)
    pf = np.zeros((1, max_length), dtype=float)
    pf[0, 0] = 1

    for isotope, coefficient in composition.items():
        Mi, pi = _get_n_atoms_envelope(
            isotope, coefficient, max_length, p=custom_p.get(isotope.get_symbol())
        )
        Mf, pf = combine_envelopes(
            Mf, pf, Mi.reshape((1, Mi.size)), pi.reshape((1, pi.size))
        )
    keep = pf >= min_p
    return Mf[keep].flatten(), pf[keep].flatten()
80 |
81 |
def combine_envelopes(
    M1: np.ndarray,
    p1: np.ndarray,
    M2: np.ndarray,
    p2: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combine the exact mass and abundance of two isotopic envelopes.

    The abundances are convolved and the masses are combined as abundance
    weighted averages.

    All arrays must be 2-dimensional and share the same shape.

    """
    shape = M1.shape
    M = np.zeros(shape, dtype=float)
    p = np.zeros(shape, dtype=float)
    # entries with zero total abundance produce 0 / 0; the resulting NaN
    # masses are converted to zero below, so the warnings are silenced here
    with np.errstate(divide='ignore', invalid='ignore'):
        for k in range(shape[1]):
            p1_k = p1[:, : k + 1]
            p2_k = p2[:, k::-1]
            pk = (p1_k * p2_k).sum(axis=1)
            Mk = (p1_k * p2_k * (M1[:, : k + 1] + M2[:, k::-1])).sum(axis=1)
            M[:, k] = Mk / pk
            p[:, k] = pk
    np.nan_to_num(M, copy=False)
    return M, p
110 |
111 |
def _get_n_atoms_envelope(
    isotope: Isotope, n: int, max_length: int, p: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute the exact mass and abundance envelope of n identical atoms.

    If the isotope is the monoisotope and `p` is ``None``, the natural
    abundances of the element are used. If the isotope is the monoisotope
    and a custom abundance `p` is provided, it is used instead of the
    natural abundances. For any other isotope, it is assumed that only that
    isotope contributes to the envelope.

    """
    element = PeriodicTable().get_element(isotope.get_symbol())
    is_monoisotope = isotope.a == element.get_monoisotope().a
    has_multiple_isotopes = len(element.isotopes) > 1
    if not (is_monoisotope and has_multiple_isotopes) or (n == 0):
        return _get_n_isotopes_envelope(isotope, n, max_length)
    if p is None:
        return _get_n_atoms_natural_abundance(isotope.get_symbol(), n, max_length)
    m, M, _ = element.get_abundances()
    _validate_abundance(p, m, isotope.get_symbol())
    return _get_n_atoms_envelope_aux(m, M, p, n, max_length)
144 |
145 |
@cache
def _get_n_atoms_natural_abundance(symbol: str, n: int, max_length: int):
    """
    Compute the envelope of n atoms using the element's natural abundances.

    Results are memoized because the same (symbol, n, max_length) arguments
    are requested repeatedly.

    aux function to _get_n_atoms_envelope

    """
    nominal, exact, abundance = PeriodicTable().get_element(symbol).get_abundances()
    return _get_n_atoms_envelope_aux(nominal, exact, abundance, n, max_length)
156 |
157 |
def _get_n_atoms_envelope_aux(
    m: np.ndarray, M: np.ndarray, p: np.ndarray, n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the envelope of n atoms.

    aux function to _get_n_atoms_envelope.

    Parameters
    ----------
    m : array[int]
        Nominal mass of each isotope.
    M : array[float]
        Exact mass of each isotope.
    p : array[float]
        Abundance of each isotope.
    n : int
        Number of atoms.
    max_length : int
        Length of the computed envelope.

    Returns
    -------
    M_unique : array[float]
        Exact mass of each isotopologue, indexed by the nominal mass
        increment relative to the lightest combination.
    p_unique : array[float]
        Abundance of each isotopologue, renormalized to sum 1 after
        discarding combinations beyond `max_length`.

    """
    n_isotopes = p.size
    # find combinations of isotopes that sum n
    combinations = _find_n_isotope_combination(n_isotopes, n)

    # find m, M and p for each combination of isotopes
    # the probability of each isotope combination follows a multinomial
    # distribution with the isotope abundances as event probabilities
    multinomial_dist = multinomial(n, p)
    m = np.matmul(combinations, m)
    M = np.matmul(combinations, M)
    p = multinomial_dist.pmf(combinations)

    # sort by exact mass
    sorted_index = np.argsort(M)
    m, M, p = m[sorted_index], M[sorted_index], p[sorted_index]

    # merge values with the same nominal mass
    # first_occurrence marks where each distinct nominal mass starts in m
    _, first_occurrence = np.unique(m, return_index=True)
    m_unique = np.zeros(max_length, dtype=m.dtype)
    M_unique = np.zeros(max_length, dtype=M.dtype)
    p_unique = np.zeros(max_length, dtype=p.dtype)
    # add the length of m_unique to include all nominal mass values
    n_unique = first_occurrence.size
    first_occurrence = list(first_occurrence)
    first_occurrence.append(m.size)
    m0 = m[0]
    for k in range(max_length):
        if k < n_unique:
            start = first_occurrence[k]
            end = first_occurrence[k + 1]
            mk = m[start]
            # i is the nominal mass increment of the k-th group relative to
            # the lightest combination; increments >= max_length are dropped
            i = mk - m0
            if i < max_length:
                m_unique[i] = mk
                pk = np.sum(p[start:end])
                p_unique[i] = pk
                # exact mass is the abundance-weighted average of the group
                M_unique[i] = np.sum(M[start:end] * p[start:end]) / pk
    p_unique = p_unique / np.sum(p_unique)
    return M_unique, p_unique
204 |
205 |
206 | def _fill_missing_nominal(
207 | m: np.ndarray, M: np.ndarray, p: np.ndarray, max_length: int
208 | ) -> Tuple[np.ndarray, np.ndarray]:
209 | rel_m = m - m[0]
210 | dm = np.arange(max_length)
211 | M_filled = np.zeros(max_length, dtype=M.dtype)
212 | p_filled = np.zeros(max_length, dtype=p.dtype)
213 | if not np.array_equal(rel_m, dm):
214 | for k, rel_m_k in enumerate(rel_m):
215 | if 0 <= rel_m_k < max_length:
216 | M_filled[rel_m_k] = M[k]
217 | p_filled[rel_m_k] = p[k]
218 | else:
219 | break
220 | M, p = M_filled, p_filled
221 | return M, p
222 |
223 |
def _find_n_isotope_combination(n_isotopes, n):
    """
    Find all combinations of isotope counts whose sum is n.

    aux function to _get_n_atoms_envelope_aux.

    """
    count_ranges = [range(n + 1)] * n_isotopes
    all_combinations = utils.cartesian_product(*count_ranges).astype(int)
    sums_to_n = all_combinations.sum(axis=1) == n
    return all_combinations[sums_to_n, :]
236 |
237 |
238 | def _validate_abundance(p: np.ndarray, m: np.ndarray, symbol: str):
239 | """
240 | Checks that user-created abundances are non-negative, normalized to 1 and
241 | has the same length as the number of stable isotopes.
242 |
243 | aux function to _get_n_atoms_envelope.
244 |
245 | """
246 | is_all_non_negative = (p >= 0.0).all()
247 | is_normalized = np.isclose(p.sum(), 1.0)
248 | is_same_size = p.size == m.size
249 | if not is_same_size:
250 | msg = "{} has {} stable isotopes. `p` must have the same size."
251 | raise ValueError(msg.format(symbol, m.size))
252 | elif not (is_normalized and is_all_non_negative):
253 | msg = "`p` elements must be non-negative and their sum normalized to 1."
254 | raise ValueError(msg)
255 |
256 |
def _get_n_isotopes_envelope(
    isotope: Isotope, n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build the envelope of n atoms of a single isotope: all the abundance is
    concentrated at the first entry.

    aux function to _get_n_atoms_envelope.

    """
    M = np.zeros(max_length, dtype=float)
    p = np.zeros(max_length, dtype=float)
    M[0] = n * isotope.m
    p[0] = 1.0
    return M, p
271 |
--------------------------------------------------------------------------------
/src/tidyms/chem/atoms.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools for working with Isotopes and Elements.
3 |
4 | Objects
5 | -------
6 | - Element
7 | - Isotope
8 | - PeriodicTable
9 |
10 | Constants
11 | ---------
12 | - EM: Mass of the electron.
13 |
14 | Exceptions
15 | ----------
16 | - InvalidIsotope
17 |
18 | """
19 | import json
20 | import numpy as np
21 | import os.path
22 | from string import digits
23 | from typing import Dict, Final, Tuple, Union
24 |
25 |
26 | EM: Final[float] = 0.00054858 # electron mass
27 |
28 |
class Isotope:
    """
    Representation of an isotope.

    Attributes
    ----------
    z: int
        Atomic number
    n: int
        Neutron number
    a: int
        Mass number
    m: float
        Exact mass.
    defect: float
        Difference between the exact mass and mass number.
    abundance: float
        Relative natural abundance of the isotope.

    """

    __slots__ = ("z", "n", "a", "m", "defect", "abundance")

    def __init__(self, z: int, a: int, m: float, abundance: float):
        self.z = z
        self.a = a
        self.n = a - z
        self.m = m
        self.defect = m - a
        self.abundance = abundance

    def __str__(self):
        return f"{self.a}{self.get_symbol()}"

    def __repr__(self):
        return f"Isotope({self})"

    def get_element(self) -> "Element":
        """Return the Element this isotope belongs to."""
        return PeriodicTable().get_element(self.z)

    def get_symbol(self) -> str:
        """Return the element symbol of this isotope."""
        return self.get_element().symbol
71 |
72 |
class Element(object):
    """
    Representation of a chemical element.

    Attributes
    ----------
    name : str
        Element name.
    symbol : str
        Element symbol.
    isotopes : Dict[int, Isotope]
        Mapping from mass number to an isotope.
    z : int
        Atomic number.
    nominal_mass : int
        Mass number of the most abundant isotope.
    monoisotopic_mass : float
        Exact mass of the most abundant isotope.
    mass_defect : float
        Difference between the monoisotopic mass and the nominal mass.

    """

    def __init__(self, symbol: str, name: str, isotopes: Dict[int, Isotope]):
        self.name = name
        self.symbol = symbol
        self.isotopes = isotopes
        mono = self.get_monoisotope()
        self.z = mono.z
        self.nominal_mass = mono.a
        self.monoisotopic_mass = mono.m
        self.mass_defect = mono.m - mono.a

    def __repr__(self):
        return "Element({})".format(self.symbol)

    def __str__(self):  # pragma: no cover
        return self.symbol

    def get_abundances(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Return the mass number, exact mass and abundance of each isotope.

        Returns
        -------
        m: array[int]
            Mass number of each isotope.
        M: array[float]
            Exact mass of each isotope.
        p: array[float]
            Abundance of each isotope.

        """
        isotope_list = list(self.isotopes.values())
        m = np.array([iso.a for iso in isotope_list], dtype=int)
        M = np.array([iso.m for iso in isotope_list])
        p = np.array([iso.abundance for iso in isotope_list])
        return m, M, p

    def get_mmi(self) -> Isotope:
        """Return the isotope with the lowest mass number."""
        return min(self.isotopes.values(), key=lambda iso: iso.a)

    def get_monoisotope(self) -> Isotope:
        """Return the most abundant isotope."""
        return max(self.isotopes.values(), key=lambda iso: iso.abundance)
141 |
142 |
def PeriodicTable():
    """
    Return the shared PeriodicTable instance, creating it on first use.

    Examples
    --------
    >>> import tidyms as ms
    >>> ptable = ms.chem.PeriodicTable()

    """
    instance = _PeriodicTable.instance
    if instance is None:
        instance = _PeriodicTable()
        _PeriodicTable.instance = instance
    return instance
156 |
157 |
class _PeriodicTable:
    """
    Periodic Table representation. Contains element and isotope information.

    This class is used as a singleton: obtain the shared instance through
    the module-level ``PeriodicTable`` function instead of instantiating it
    directly.

    Methods
    -------
    get_element
    get_isotope

    """

    # shared singleton instance, managed by the PeriodicTable() accessor
    instance = None

    def __init__(self):
        # lookup tables built once from the bundled JSON data files:
        # element symbol -> Element
        self._symbol_to_element = _make_periodic_table()
        # atomic number -> Element
        self._z_to_element = {v.z: v for v in self._symbol_to_element.values()}
        # (atomic number, mass number) -> Isotope
        self._za_to_isotope = dict()
        # string representation (e.g. "13C") -> Isotope
        self._str_to_isotope = dict()
        for el_str in self._symbol_to_element:
            el = self._symbol_to_element[el_str]
            for isotope in el.isotopes.values():
                self._za_to_isotope[(isotope.z, isotope.a)] = isotope
                self._str_to_isotope[str(isotope.a) + el_str] = isotope

    def get_element(self, element: Union[str, int]) -> Element:
        """
        Returns an Element object using its symbol or atomic number.

        Parameters
        ----------
        element : str or int
            element symbol or atomic number.

        Returns
        -------
        Element

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> h = ptable.get_element("H")
        >>> c = ptable.get_element(6)

        """
        if isinstance(element, int):
            element = self._z_to_element[element]
        else:
            element = self._symbol_to_element[element]
        return element

    def __iter__(self):
        """Iterate over all elements in the table."""
        for el in self._symbol_to_element.values():
            yield el

    def get_isotope(self, x: str, copy: bool = False) -> Isotope:
        """
        Returns an isotope object from a string representation.

        Parameters
        ----------
        x : str
            A string representation of an isotope. If only the symbol is
            provided in the string, the monoisotope is returned.
        copy : bool
            If True creates a new Isotope object.

        Returns
        -------
        Isotope

        Raises
        ------
        InvalidIsotope
            If `x` is not a known isotope string or element symbol.

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> d = ptable.get_isotope("2H")
        >>> cl35 = ptable.get_isotope("Cl")

        """
        try:
            # a leading digit means the mass number is included, e.g. "13C";
            # otherwise the element's monoisotope is returned
            if x[0] in digits:
                isotope = self._str_to_isotope[x]
            else:
                isotope = self.get_element(x).get_monoisotope()
            if copy:
                isotope = Isotope(isotope.z, isotope.a, isotope.m, isotope.abundance)
            return isotope
        except KeyError:
            msg = "{} is not a valid input.".format(x)
            raise InvalidIsotope(msg)
248 |
249 |
def _make_periodic_table() -> Dict[str, Element]:
    """
    Build the periodic table data from the bundled ``elements.json`` and
    ``isotopes.json`` files.

    """
    data_dir, _ = os.path.split(__file__)

    with open(os.path.join(data_dir, "elements.json"), "r") as fin:
        element_names = json.load(fin)

    with open(os.path.join(data_dir, "isotopes.json"), "r") as fin:
        isotope_data = json.load(fin)

    return {
        symbol: Element(
            symbol,
            element_names[symbol],
            {entry["a"]: Isotope(**entry) for entry in entries},
        )
        for symbol, entries in isotope_data.items()
    }
267 |
268 |
class InvalidIsotope(ValueError):
    """Raised when a string cannot be interpreted as a known isotope."""
271 |
--------------------------------------------------------------------------------
/src/tidyms/chem/elements.json:
--------------------------------------------------------------------------------
1 | {
2 | "Xx": "Dummy",
3 | "H": "Hydrogen",
4 | "He": "Helium",
5 | "Li": "Lithium",
6 | "Be": "Beryllium",
7 | "B": "Boron",
8 | "C": "Carbon",
9 | "N": "Nitrogen",
10 | "O": "Oxygen",
11 | "F": "Fluorine",
12 | "Ne": "Neon",
13 | "Na": "Sodium",
14 | "Mg": "Magnesium",
15 | "Al": "Aluminium",
16 | "Si": "Silicon",
17 | "P": "Phosphorus",
18 | "S": "Sulfur",
19 | "Cl": "Chlorine",
20 | "Ar": "Argon",
21 | "K": "Potassium",
22 | "Ca": "Calcium",
23 | "Sc": "Scandium",
24 | "Ti": "Titanium",
25 | "V": "Vanadium",
26 | "Cr": "Chromium",
27 | "Mn": "Manganese",
28 | "Fe": "Iron",
29 | "Co": "Cobalt",
30 | "Ni": "Nickel",
31 | "Cu": "Copper",
32 | "Zn": "Zinc",
33 | "Ga": "Gallium",
34 | "Ge": "Germanium",
35 | "As": "Arsenic",
36 | "Se": "Selenium",
37 | "Br": "Bromine",
38 | "Kr": "Krypton",
39 | "Rb": "Rubidium",
40 | "Sr": "Strontium",
41 | "Y": "Yttrium",
42 | "Zr": "Zirconium",
43 | "Nb": "Niobium",
44 | "Mo": "Molybdenum",
45 | "Tc": "Technetium",
46 | "Ru": "Ruthenium",
47 | "Rh": "Rhodium",
48 | "Pd": "Palladium",
49 | "Ag": "Silver",
50 | "Cd": "Cadmium",
51 | "In": "Indium",
52 | "Sn": "Tin",
53 | "Sb": "Antimony",
54 | "Te": "Tellurium",
55 | "I": "Iodine",
56 | "Xe": "Xenon",
57 | "Cs": "Caesium",
58 | "Ba": "Barium",
59 | "La": "Lanthanum",
60 | "Ce": "Cerium",
61 | "Pr": "Praseodymium",
62 | "Nd": "Neodymium",
63 | "Pm": "Promethium",
64 | "Sm": "Samarium",
65 | "Eu": "Europium",
66 | "Gd": "Gadolinium",
67 | "Tb": "Terbium",
68 | "Dy": "Dysprosium",
69 | "Ho": "Holmium",
70 | "Er": "Erbium",
71 | "Tm": "Thulium",
72 | "Yb": "Ytterbium",
73 | "Lu": "Lutetium",
74 | "Hf": "Hafnium",
75 | "Ta": "Tantalum",
76 | "W": "Tungsten",
77 | "Re": "Rhenium",
78 | "Os": "Osmium",
79 | "Ir": "Iridium",
80 | "Pt": "Platinum",
81 | "Au": "Gold",
82 | "Hg": "Mercury",
83 | "Tl": "Thallium",
84 | "Pb": "Lead",
85 | "Bi": "Bismuth",
86 | "Po": "Polonium",
87 | "At": "Astatine",
88 | "Rn": "Radon",
89 | "Fr": "Francium",
90 | "Ra": "Radium",
91 | "Ac": "Actinium",
92 | "Th": "Thorium",
93 | "Pa": "Protactinium",
94 | "U": "Uranium",
95 | "Np": "Neptunium",
96 | "Pu": "Plutonium",
97 | "Am": "Americium",
98 | "Cm": "Curium",
99 | "Bk": "Berkelium",
100 | "Cf": "Californium",
101 | "Es": "Einsteinium",
102 | "Fm": "Fermium",
103 | "Md": "Mendelevium",
104 | "No": "Nobelium",
105 | "Lr": "Lawrencium",
106 | "Rf": "Rutherfordium",
107 | "Db": "Dubnium",
108 | "Sg": "Seaborgium",
109 | "Bh": "Bohrium",
110 | "Hs": "Hassium",
111 | "Mt": "Meitnerium",
112 | "Ds": "Darmstadtium",
113 | "Rg": "Roentgenium",
114 | "Cn": "Copernicium",
115 | "Uut": "Ununtrium",
116 | "Fl": "Flerovium",
117 | "Uup": "Ununpentium",
118 | "Lv": "Livermorium",
119 | "Uus": "Ununseptium",
120 | "Uuo": "Ununoctium"
121 | }
--------------------------------------------------------------------------------
/src/tidyms/chem/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | functions and classes used in different modules
4 | """
5 |
6 | import numpy as np
7 |
8 |
def cartesian_product(*args):
    """Compute the cartesian product of the input sequences.

    Returns a 2D array with one row per combination and one column per input
    sequence; the last input varies fastest. The result dtype is taken from
    the first argument. Returns ``None`` when called with no arguments.
    """
    res = None
    for x in args:
        x = np.array(x)
        if res is None:
            # first argument: a single column of values
            res = x.reshape((x.size, 1))
        else:
            n_row, n_col = res.shape
            expanded = np.zeros((n_row * x.size, n_col + 1), dtype=res.dtype)
            # repeat every existing row once per new value ...
            expanded[:, :n_col] = res[np.repeat(np.arange(n_row), x.size)]
            # ... and cycle the new values through the last column
            expanded[:, -1] = np.tile(x, n_row)
            res = expanded
    return res
27 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | pytest>=7.1.0
2 | pytest-cov>=3.0.0
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/tests/__init__.py
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from tidyms.simulation import simulate_dataset
4 | from tidyms.container import DataContainer
5 | from tidyms import fileio
6 | from tidyms.utils import get_tidyms_path
7 | import numpy as np
8 | import pytest
9 | import os
10 |
11 |
12 | # simulated data used for tests
def pytest_sessionstart(session):
    """Download every available test dataset before the session starts."""
    datasets = fileio.list_available_datasets(False)
    for name in datasets:
        fileio.download_dataset(name)
16 |
17 |
@pytest.fixture
def data_container_with_order():
    """Simulated DataContainer that includes run order and batch data."""
    population = {"healthy": 20, "disease": 35}
    mean = {"healthy": np.array([50, 100, 150]), "disease": np.array([150, 200, 300])}
    cov = {"healthy": np.array([1, 1, 1]), "disease": np.array([2, 2, 2])}
    blank_contribution = np.array([3, 5, 10])
    mz = np.array([100, 200, 300])
    rt = np.array([50, 60, 70])
    return simulate_dataset(
        population,
        mean,
        cov,
        mz,
        rt,
        blank_contribution,
        prepend_blank=1,
        append_blank=1,
    )
30 |
31 |
@pytest.fixture
def data_container_with_order_single_qc():
    """Same as data_container_with_order but with a single QC per block."""
    population = {"healthy": 20, "disease": 35}
    mean = {"healthy": np.array([50, 100, 150]), "disease": np.array([150, 200, 300])}
    cov = {"healthy": np.array([1, 1, 1]), "disease": np.array([2, 2, 2])}
    blank_contribution = np.array([3, 5, 10])
    mz = np.array([100, 200, 300])
    rt = np.array([50, 60, 70])
    return simulate_dataset(
        population,
        mean,
        cov,
        mz,
        rt,
        blank_contribution,
        prepend_blank=1,
        append_blank=1,
        triple_qc=False,
    )
52 |
53 |
@pytest.fixture
def data_container_without_order(data_container_with_order):
    """Copy of data_container_with_order with run order/batch info removed."""
    src = data_container_with_order
    sample_metadata = src.sample_metadata.copy()
    sample_metadata.pop("order")
    sample_metadata.pop("batch")
    # keep only the mapping entries that are actually defined
    mapping = {k: v for k, v in src.mapping.items() if v is not None}
    return DataContainer(
        src.data_matrix.copy(), src.feature_metadata.copy(), sample_metadata, mapping
    )
64 |
65 |
@pytest.fixture
def centroid_mzml():
    """MSData instance built from the centroid test mzML file."""
    # NOTE(review): the file holds centroid data but ms_mode is set to
    # "profile" — confirm whether this mismatch is intentional.
    data_path = os.path.join(
        get_tidyms_path(),
        "test-raw-data",
        "centroid-data-zlib-indexed-compressed.mzML",
    )
    return fileio.MSData.create_MSData_instance(data_path, ms_mode="profile")
74 |
75 |
@pytest.fixture
def profile_mzml():
    """MSData instance built from the profile test mzML file."""
    data_path = os.path.join(
        get_tidyms_path(),
        "test-raw-data",
        "profile-data-zlib-indexed-compressed.mzML",
    )
    return fileio.MSData.create_MSData_instance(data_path, ms_mode="profile")
83 |
--------------------------------------------------------------------------------
/tests/integration/test_assay_real_data.py:
--------------------------------------------------------------------------------
1 | import tidyms as ms
2 | import numpy as np
3 | import pytest
4 | from pathlib import Path
5 |
6 |
@pytest.fixture
def assay(tmpdir) -> ms.Assay:
    """Assay over the NIST raw test data, stored in a temporary directory."""
    data_path = Path(ms.fileio.get_tidyms_path()) / "test-nist-raw-data"
    assay_path = Path(tmpdir) / "test-assay"
    return ms.Assay(assay_path, data_path)
13 |
14 |
@pytest.fixture
def detect_features_params() -> dict:
    """Parameters for Assay.detect_features targeted to known m/z values."""
    # restricting detection to known m/z values keeps the test fast
    known_mz = np.array([144.081, 146.060, 195.086, 189.0734, 205.0967, 188.071])
    return {"tolerance": 0.015, "min_intensity": 5000, "targeted_mz": known_mz}
24 |
25 |
@pytest.fixture
def extract_features_params() -> dict:
    """Parameters for Assay.extract_features."""
    params = {"store_smoothed": True}
    return params
29 |
30 |
def test_detect_features(assay, detect_features_params):
    """Feature detection on real data completes without raising."""
    assay.detect_features(**detect_features_params)
    assert True  # passes if no exception was raised
34 |
35 |
def test_extract_features(assay, detect_features_params, extract_features_params):
    """Feature extraction after detection completes without raising."""
    assay.detect_features(**detect_features_params)
    assay.extract_features(**extract_features_params)
    assert True  # passes if no exception was raised
44 |
45 |
def test_describe_features(assay, detect_features_params, extract_features_params):
    """Feature description after detection/extraction completes without raising."""
    assay.detect_features(**detect_features_params)
    assay.extract_features(**extract_features_params)
    assay.describe_features()
    assert True  # passes if no exception was raised
55 |
56 |
def test_build_feature_table(assay, detect_features_params, extract_features_params):
    """Building the feature table completes without raising."""
    assay.detect_features(**detect_features_params)
    assay.extract_features(**extract_features_params)
    assay.describe_features()
    assay.build_feature_table()
    assert True  # passes if no exception was raised
67 |
68 |
def test_match_features(assay, detect_features_params, extract_features_params):
    """Feature correspondence across samples completes without raising."""
    assay.detect_features(**detect_features_params)
    assay.extract_features(**extract_features_params)
    assay.describe_features()
    assay.build_feature_table()
    assay.match_features()
    assert True  # passes if no exception was raised
80 |
81 |
def test_build_data_matrix(assay, detect_features_params, extract_features_params):
    """The full pipeline up to the data matrix completes without raising."""
    assay.detect_features(**detect_features_params)
    assay.extract_features(**extract_features_params)
    assay.describe_features()
    assay.build_feature_table()
    assay.match_features()
    assay.make_data_matrix()
    assert True  # passes if no exception was raised
--------------------------------------------------------------------------------
/tests/integration/test_real_raw_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Test lcms and fileio functionality with real data.
3 |
4 | """
5 |
6 | import tidyms as ms
7 | import numpy as np
8 | import pytest
9 | import os
10 |
11 |
@pytest.fixture
def ms_data_centroid() -> ms.MSData:
    """MSData built from the centroid test mzML file."""
    data_path = os.path.join(
        ms.fileio.get_tidyms_path(),
        "test-raw-data",
        "centroid-data-zlib-indexed-compressed.mzML",
    )
    return ms.MSData.create_MSData_instance(data_path)
18 |
19 |
def test_ms_data_invalid_ms_mode_setter(ms_data_centroid):
    """Setting an unknown ms_mode must raise ValueError."""
    with pytest.raises(ValueError):
        ms_data_centroid.ms_mode = "invalid-mode"
23 |
24 |
def test_ms_data_invalid_instrument_setter(ms_data_centroid):
    """Setting an unknown instrument must raise ValueError."""
    with pytest.raises(ValueError):
        ms_data_centroid.instrument = "invalid-instrument"
28 |
29 |
def test_ms_data_invalid_separation_setter(ms_data_centroid):
    """Setting an unknown separation must raise ValueError."""
    with pytest.raises(ValueError):
        ms_data_centroid.separation = "invalid-separation"
33 |
34 |
def test_make_chromatogram_ms_level_1(ms_data_centroid):
    """Chromatograms share the retention-time grid of the MS1 spectra."""
    mz = np.array([205.098, 524.37, 188.07])  # some m/z observed in the data
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz)
    expected_rt = np.array(
        [sp.time for _, sp in ms_data_centroid.get_spectra_iterator(ms_level=1)]
    )
    for chrom in chromatograms:
        assert np.array_equal(expected_rt, chrom.time)
        assert chrom.time.size == chrom.spint.size
45 |
46 |
def test_ms_data_get_spectrum(ms_data_centroid):
    """Fetching the first spectrum completes without raising."""
    ms_data_centroid.get_spectrum(0)
    assert True  # passes if no exception was raised
50 |
51 |
def test_make_tic_ms_level_1(ms_data_centroid):
    """The TIC is defined on the retention-time grid of the MS1 spectra."""
    tic = ms.make_tic(ms_data_centroid, ms_level=1)
    expected_rt = np.array(
        [sp.time for _, sp in ms_data_centroid.get_spectra_iterator(ms_level=1)]
    )
    assert np.array_equal(expected_rt, tic.time)
    assert tic.time.size == tic.spint.size
60 |
61 |
def test_make_chromatogram_ms_level_2(ms_data_centroid):
    """MS2 chromatograms share the retention-time grid of the MS2 spectra."""
    mz = np.array([205.098, 524.37, 188.07])  # some m/z observed in the data
    ms_level = 2
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz, ms_level=ms_level)
    expected_rt = np.array(
        [sp.time for _, sp in ms_data_centroid.get_spectra_iterator(ms_level=ms_level)]
    )
    for chrom in chromatograms:
        assert np.array_equal(expected_rt, chrom.time)
        assert chrom.time.size == chrom.spint.size
74 |
75 |
def test_make_roi(ms_data_centroid):
    """Every ROI stores time, intensity and scan arrays of equal size."""
    for roi in ms.make_roi(ms_data_centroid):
        assert roi.time.size == roi.spint.size
        assert roi.time.size == roi.scan.size
82 |
83 |
def test_accumulate_spectra(ms_data_centroid):
    """Accumulated spectra have m/z and intensity arrays of equal size."""
    spectrum = ms.accumulate_spectra(ms_data_centroid, start_time=20, end_time=30)
    assert spectrum.mz.size == spectrum.spint.size
87 |
--------------------------------------------------------------------------------
/tests/unit/annotation/test_annotation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 |
4 | from tidyms.annotation import annotation
5 | from tidyms.raw_data_utils import make_roi
6 | from tidyms.fileio import MSData_simulated
7 | from tidyms.lcms import Peak
8 | from tidyms.chem import Formula
9 |
10 |
@pytest.fixture
def annotation_tools_params():
    """Parameters used to build the annotation tools in these tests."""
    return {
        "bounds": {
            "C": (0, 50),
            "H": (0, 100),
            "O": (0, 20),
            "N": (0, 20),
            "Cl": (0, 2),
            "B": (0, 1),
        },
        "max_mass": 2500,
        "max_length": 10,
        "max_charge": 3,
        "min_M_tol": 0.005,
        "max_M_tol": 0.01,
        "p_tol": 0.05,
        "min_similarity": 0.9,
        "min_p": 0.01,
    }
33 |
34 |
def test__annotate_empty_feature_list(annotation_tools_params):
    """annotate must accept an empty feature list without raising."""
    tools = annotation.create_annotation_tools(**annotation_tools_params)
    annotation.annotate([], *tools)
39 |
40 |
@pytest.fixture
def compound_data():
    """Charged species with their retention times and peak amplitudes."""
    compounds = [
        "[C10H20O2]-",
        "[C10H20BO3]-",
        "[C20H40BO5]2-",
        "[C18H19N2O3]-",
        "[C18H20N2O3Cl]-",
        "[C10H20Cl]-",
    ]
    rt = [50, 75, 150, 200, 200, 175]
    amplitude = [10000, 20000, 30000, 25000, 25000, 20000]
    return compounds, rt, amplitude
54 |
55 |
@pytest.fixture
def feature_list(compound_data) -> list[Peak]:
    """Peaks extracted from a simulated sample built from compound_data."""
    compounds, rt_list, amp_list = compound_data
    mz_grid = np.linspace(100, 1200, 20000)
    rt_grid = np.arange(300)
    width = 4
    mz_params = list()
    rt_params = list()
    for comp, c_amp, c_rt in zip(compounds, amp_list, rt_list):
        # four isotopologues per compound
        M, p = Formula(comp).get_isotopic_envelope(4)
        mz_params.append([[x, y] for x, y in zip(M, p)])
        rt_params.append([[c_rt, width, c_amp] for _ in M])
    ms_data = MSData_simulated(
        mz_grid, rt_grid, np.vstack(mz_params), np.vstack(rt_params), noise=0.025
    )

    peaks = list()
    for roi_index, roi in enumerate(make_roi(ms_data, tolerance=0.01)):
        roi.extract_features()
        roi.index = roi_index
        for feature_index, ft in enumerate(roi.features):
            ft.index = feature_index
        peaks.extend(roi.features)
    return peaks
85 |
86 |
def test_annotate(feature_list, annotation_tools_params):
    """Features are annotated into six envelopes of four isotopologues each."""
    tools = annotation.create_annotation_tools(**annotation_tools_params)
    annotation.annotate(feature_list, *tools)

    # group features by isotopologue label
    groups = dict()
    for ft in feature_list:
        groups.setdefault(ft.annotation.isotopologue_label, list()).append(ft)
    groups.pop(-1)
    assert len(groups) == 6
    for members in groups.values():
        # features were generated with 4 isotopologues
        assert len(members) == 4
100 |
--------------------------------------------------------------------------------
/tests/unit/annotation/test_envelope_finder.py:
--------------------------------------------------------------------------------
1 | from tidyms.annotation import envelope_finder as ef
2 | from tidyms.annotation.annotation_data import AnnotationData
3 | from tidyms.chem import PeriodicTable
4 | from tidyms.chem import Formula
5 | from tidyms.lcms import LCTrace, Peak
6 | import pytest
7 | import numpy as np
8 | from collections.abc import Sequence
9 |
10 |
@pytest.fixture
def formulas():
    """Formula strings used to generate test envelopes, keyed by element set.

    ``"cho"`` formulas contain only C/H/O; ``"chnops"`` formulas may also
    contain N, P and S.
    """
    formulas = {
        "cho": [
            "C27H34O9",
            "C62H120O6",
            "C59H114O6",
            "C62H120O6",
            "C56H42O10",
            "C17H20O4",
            "C54H104O6",
            "C48H92O6",
            "C52H100O6",
            "C54H104O6",
            "C47H90O6",
            "C50H96O6",
            "C56H108O6",
            "C21H19O13",
            "C57H94O6",
            "C58H112O6",
            "C64H124O6",
            "C24H20O8",
            "C17H12O6",
            "C61H118O6",
            "C47H90O6",
            "C6H12O6",
            "C63H106O6",
            "C40H52O4",
            "C61H118O6",
            "C61H118O6",
            "C57H96O6",
            "C37H72O5",
            "C28H44O2",
            "C29H24O12",
            "C51H98O6",
            "C39H72O5",
            "C46H78O7",
            "C54H104O6",
            "C63H110O6",
            "C21H18O13",
            "C53H102O6",
            "C62H120O6",
            "C59H114O6",
            "C41H78O6",
            "C25H30O6",
            "C51H98O6",
            "C53H102O6",
            "C43H68O13",
            "C37H72O5",
            "C59H114O6",
            "C15H12O4",
            "C16H18O4",
            "C61H110O6",
            "C58H112O6",
        ],
        "chnops": [
            "C41H80NO8P",
            "C54H104O6",
            "C27H40O2",
            "C24H26O12",
            "C55H106O6",
            "C45H80O16P2",
            "C50H96O6",
            "C8H13NO",
            "C35H36O15",
            "C48H92O6",
            "C63H98O6",
            "C15H14O5",
            "C18H23N3O6",
            "C44H80NO8P",
            "C47H90O6",
            "C47H84O16P2",
            "C14H14O4",
            "C46H80NO10P",
            "C35H64O9",
            "C51H98O6",
            "C6H12O6",
            "C26H34O7",
            "C17H18O4",
            "C6H8O9S",
            "C63H100O6",
            "C51H98O6",
            "C6H12O",
            "C50H96O6",
            "C56H108O6",
            "C61H114O6",
            "C57H110O6",
            "C44H76NO8P",
            "C63H110O6",
            "C41H71O8P",
            "C16H16O10",
            "C21H20O15",
            "C4H6O3",
            "C16H18O9",
            "C51H98O6",
            "C57H94O6",
            "C4H9NO2",
            "C56H108O6",
            "C6H8O7",
            "C57H98O6",
            "C63H110O6",
            "C58H112O6",
            "C12H16O7S",
            "C27H30O12",
            "C26H28O16",
            "C27H38O12",
        ],
    }
    return formulas
120 |
121 |
@pytest.fixture
def elements():
    """Element symbol lists matching each formula group."""
    return {"cho": ["C", "H", "O"], "chnops": ["C", "H", "N", "O", "P", "S"]}
126 |
127 |
def create_feature_list_from_formula(f_str: str) -> Sequence[Peak]:
    """Create one Peak per isotopologue m/z of the given formula string."""
    f = Formula(f_str)
    M, _ = f.get_isotopic_envelope()
    # convert masses to m/z when the species is charged
    mz_values = M / abs(f.charge) if f.charge else M
    feature_list = list()
    for k_mz in mz_values:
        size = 30
        time = np.linspace(0, size, size)
        scan = np.arange(size)
        spint = np.ones(size)
        roi = LCTrace(time, spint, spint * k_mz, scan)
        feature_list.append(Peak(10, 15, 20, roi))
    return feature_list
145 |
146 |
@pytest.mark.parametrize("element_set", ["cho", "chnops"])
def test__make_exact_mass_difference_bounds(elements, element_set):
    """Mass-difference bounds must bracket every isotope's exact mass shift."""
    element_objects = [PeriodicTable().get_element(x) for x in elements[element_set]]
    bounds = ef._make_exact_mass_difference_bounds(element_objects, 0.0)
    for e in element_objects:
        nominal, exact, _ = e.get_abundances()
        nominal = nominal - nominal[0]
        exact = exact - exact[0]
        # each nominal mass increment maps to a (min, max) exact-mass window
        for increment, shift in zip(nominal[1:], exact[1:]):
            lower, upper = bounds[increment]
            assert lower <= shift <= upper
162 |
163 |
@pytest.mark.parametrize("element_set", ["cho", "chnops"])
def test__get_next_mz_search_interval_mz(elements, formulas, element_set):
    """The search interval must contain the next isotopologue m/z."""
    element_objects = [PeriodicTable().get_element(x) for x in elements[element_set]]
    dM_bounds = ef._make_exact_mass_difference_bounds(element_objects, 0.0)
    for f_str in formulas[element_set]:
        features = create_feature_list_from_formula(f_str)
        for k in range(1, len(features) - 1):
            min_mz, max_mz = ef._get_next_mz_search_interval(
                features[:k], dM_bounds, 1, 0.005
            )
            assert min_mz < features[k].mz < max_mz
179 |
180 |
@pytest.mark.parametrize("charge", list(range(1, 6)))
def test_get_k_bounds_multiple_charges(elements, formulas, charge):
    """The search interval must contain the next isotopologue m/z for every charge.

    Bug fix: the original asserts compared the Peak object itself against the
    float bounds (``features[k]``); compare against ``features[k].mz`` as the
    sibling test ``test__get_next_mz_search_interval_mz`` does.
    """
    element_objects = [PeriodicTable().get_element(x) for x in elements["chnops"]]
    bounds = ef._make_exact_mass_difference_bounds(element_objects, 0.0)
    for f_str in formulas["chnops"]:
        features = create_feature_list_from_formula(f"[{f_str}]{charge}+")
        for k in range(1, len(features) - 1):
            m_min, m_max = ef._get_next_mz_search_interval(
                features[:k], bounds, charge, 0.005
            )
            # compare the feature m/z, not the Peak object
            assert m_min < features[k].mz < m_max
195 |
196 |
@pytest.mark.parametrize(
    "elements_set,charge", [["cho", 1], ["cho", 2], ["chnops", 1], ["chnops", 2]]
)
def test__find_envelopes(formulas, elements, elements_set, charge):
    """_find_envelopes recovers the full envelope from formula-derived m/z."""
    element_objects = [PeriodicTable().get_element(x) for x in elements[elements_set]]
    bounds = ef._make_exact_mass_difference_bounds(element_objects, 0.0)
    max_length = 10
    mz_tol = 0.005
    min_similarity = 0.9
    for f_str in formulas[elements_set]:
        features = create_feature_list_from_formula(f"[{f_str}]{charge}+")
        data = AnnotationData(features)
        mmi = data.features[0]
        results = ef._find_envelopes(
            data.features,
            mmi,
            data.non_annotated,
            data.similarity_cache,
            charge,
            max_length,
            mz_tol,
            min_similarity,
            bounds,
        )
        assert results[0] == features
228 |
229 |
@pytest.mark.parametrize("elements_set", ["cho", "chnops"])
def test__find_envelopes_no_charge(formulas, elements, elements_set):
    """_find_envelopes also recovers the envelope when the charge is zero."""
    element_objects = [PeriodicTable().get_element(x) for x in elements[elements_set]]
    bounds = ef._make_exact_mass_difference_bounds(element_objects, 0.0)
    max_length = 10
    charge = 0
    mz_tol = 0.005
    min_similarity = 0.9
    for f_str in formulas[elements_set]:
        features = create_feature_list_from_formula(f_str)
        data = AnnotationData(features)
        results = ef._find_envelopes(
            features,
            features[0],
            data.non_annotated,
            data.similarity_cache,
            charge,
            max_length,
            mz_tol,
            min_similarity,
            bounds,
        )
        assert results[0] == features
259 |
260 |
def test_EnvelopeFinder(elements, formulas):
    """EnvelopeFinder returns exactly one envelope matching all features."""
    envelope_finder = ef.EnvelopeFinder(elements["chnops"], 0.005, max_length=10)
    charge = 1
    for f_str in formulas["chnops"]:
        features = create_feature_list_from_formula(f_str)
        data = AnnotationData(features)
        results = envelope_finder.find(data, features[0], charge)
        assert len(results) == 1
        assert results[0] == features
274 |
--------------------------------------------------------------------------------
/tests/unit/annotation/test_mmi_finder.py:
--------------------------------------------------------------------------------
1 | from tidyms.annotation import mmi_finder
2 | from tidyms.annotation.annotation_data import AnnotationData
3 | from tidyms.chem import PeriodicTable
4 | from tidyms.lcms import LCTrace, Peak
5 | import pytest
6 | import numpy as np
7 | from typing import Sequence
8 |
9 |
def test__select_two_isotope_elements_dm_1_p0_greater_than_pi():
    """C is selected among common elements for a +1 mass difference."""
    candidates = ["C", "H", "N", "O", "P", "S"]
    expected = ["C"]
    result = mmi_finder._select_two_isotope_element(candidates, 1, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
18 |
19 |
def test__select_two_isotope_elements_dm_1_p0_greater_than_pi_custom_abundance():
    """A custom abundance can shift the dm=1 selection to H."""
    candidates = ["C", "H", "N", "O", "P", "S"]
    expected = ["H"]
    custom_abundances = {"H": np.array([0.95, 0.05])}
    result = mmi_finder._select_two_isotope_element(candidates, 1, custom_abundances)
    assert len(result) == len(expected)
    assert set(result) == set(expected)
28 |
29 |
def test__select_two_isotope_elements_dm_1_no_elements():
    """No element is selected when none has a dm=1 two-isotope pattern."""
    result = mmi_finder._select_two_isotope_element(["O", "P", "S"], 1, dict())
    assert len(result) == 0
36 |
37 |
def test__select_two_isotope_elements_dm_1_p0_lower_than_pi():
    """B and Li are selected when the heavier isotope is the most abundant."""
    candidates = ["B", "Li", "O", "P", "S"]
    expected = ["B", "Li"]
    result = mmi_finder._select_two_isotope_element(candidates, 1, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
46 |
47 |
def test__select_two_isotope_elements_dm_1_p0_lower_and_higher_than_pi():
    """Both abundance orderings are handled in the same candidate list."""
    candidates = ["C", "H", "B", "Li", "O", "P", "S"]
    expected = ["C", "B", "Li"]
    result = mmi_finder._select_two_isotope_element(candidates, 1, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
56 |
57 |
def test__select_two_isotope_elements_dm_2_p0_greater_than_pi():
    """Cl is selected among common elements for a +2 mass difference."""
    candidates = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["Cl"]
    result = mmi_finder._select_two_isotope_element(candidates, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
66 |
67 |
def test__select_two_isotope_elements_dm_2_p0_greater_than_pi_custom_abundance():
    """A custom Br abundance forces Cl to be the dm=2 selection."""
    candidates = ["Cl", "Br", "N", "O", "P", "S"]
    expected = ["Cl"]
    # Br abundance adjusted to force the result to be Cl
    custom_abundances = {"Br": np.array([0.9, 0.1])}
    result = mmi_finder._select_two_isotope_element(candidates, 2, custom_abundances)
    assert len(result) == len(expected)
    assert set(result) == set(expected)
77 |
78 |
def test__select_two_isotope_elements_dm_2_no_elements():
    """No element is selected when none has a dm=2 two-isotope pattern."""
    result = mmi_finder._select_two_isotope_element(["O", "P", "S"], 2, dict())
    assert len(result) == 0
85 |
86 |
def test__select_two_isotope_elements_dm_2_p0_lower_than_pi():
    """In is selected when the heavier isotope is the most abundant (dm=2)."""
    candidates = ["In", "H", "O", "P", "S"]
    expected = ["In"]
    result = mmi_finder._select_two_isotope_element(candidates, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
95 |
96 |
def test__select_two_isotope_elements_dm_2_p0_lower_and_higher_than_pi():
    """Both abundance orderings are handled for a +2 mass difference."""
    candidates = ["Cl", "In", "Br", "O", "P", "S"]
    expected = ["Br", "In"]
    result = mmi_finder._select_two_isotope_element(candidates, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)
105 |
106 |
def test__select_multiple_isotope_elements():
    """O and S are the multi-isotope elements among common elements."""
    candidates = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["O", "S"]
    result = mmi_finder._select_multiple_isotope_elements(candidates)
    assert len(result) == len(expected)
    assert set(result) == set(expected)
113 |
114 |
def test__select_multiple_isotope_elements_no_elements():
    """An empty selection is returned when no multi-isotope element exists."""
    candidates = ["Cl", "H", "N", "P"]
    expected = []
    result = mmi_finder._select_multiple_isotope_elements(candidates)
    assert len(result) == len(expected)
    assert set(result) == set(expected)
121 |
122 |
@pytest.mark.parametrize(
    "elements,expected",
    [
        [["C", "H", "N", "O", "P", "S"], ["C", "O", "S"]],
        [["C", "H", "N", "O", "P", "S", "Cl", "Li", "Na"], ["C", "O", "S", "Li", "Cl"]],
    ],
)
def test__select_elements(elements, expected):
    """_select_elements keeps only the isotopically informative elements."""
    symbols = [e.symbol for e in mmi_finder._select_elements(elements)]
    assert len(symbols) == len(expected)
    assert set(symbols) == set(expected)
135 |
136 |
@pytest.fixture
def rules():
    """MMI search rules plus the parameters used to build them."""
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    max_mass = 2000.0
    length = 5
    bin_size = 100
    p_tol = 0.05
    rules_dict = mmi_finder._create_rules_dict(
        bounds, max_mass, length, bin_size, p_tol, None
    )
    return rules_dict, max_mass, length, bin_size
146 |
147 |
def create_peak_list(mz: list[float], sp: list[float]) -> Sequence[Peak]:
    """Create one Peak per (m/z, intensity) pair on a shared scan grid."""
    size = 30
    time = np.linspace(0, size, size)
    scan = np.arange(size)
    spint = np.ones(size)
    peaks = list()
    for k_mz, k_sp in zip(mz, sp):
        # each trace gets its own copy of the time grid
        roi = LCTrace(time.copy(), spint * k_sp, spint * k_mz, scan)
        peaks.append(Peak(10, 15, 20, roi))
    return peaks
159 |
160 |
def test__find_candidates(rules):
    """The Cl MMI one nominal mass below the monoisotopologue is found."""
    rules_dict, max_mass, length, bin_size = rules
    # build a peak list where the monoisotopic m/z is M1 of a Cl envelope
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    mz = [100.0, 300.0, mono_mz - dm_cl, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[3]

    # pick the rule associated with the monoisotopologue mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)

    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1)]
191 |
192 |
def test__find_candidates_multiple_candidates(rules):
    """Two nearly identical m/z values both qualify as MMI candidates."""
    rules_dict, max_mass, length, bin_size = rules
    # build a peak list where the monoisotopic m/z is M1 of a Cl envelope
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    first_candidate = mono_mz - dm_cl
    second_candidate = first_candidate + 0.00001
    mz = [100.0, 300.0, first_candidate, second_candidate, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 500.5, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[4]

    # pick the rule associated with the monoisotopologue mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)

    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1), (peak_list[3], 1)]
223 |
224 |
def test__find_candidates_no_candidates(rules):
    """No candidate is returned when no peak lies below the monoisotopologue.

    The original version fetched the Cl abundances into an unused local;
    that dead assignment is removed here.
    """
    rules, max_mass, length, bin_size = rules
    mono_mz = 400.0
    charge = 1
    mono_index = 2
    # peak list without any plausible MMI below mono_mz
    mz = [100.0, 300.0, mono_mz, 456.0]
    sp = [100.0, 200.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # find the rule to search the mmi candidate
    m_bin = int(mono_mz // bin_size)
    i_rules = rules.get(m_bin)[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)

    test_candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert len(test_candidates) == 0
251 |
252 |
def test_MMIFinder():
    """MMIFinder reports the monoisotopologue at each charge plus the Cl MMI."""
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    finder = mmi_finder.MMIFinder(
        bounds,
        2000.0,  # max_mass
        3,       # max_charge
        5,       # length
        100,     # bin_size
        0.005,   # mz_tol
        0.05,    # p_tol
        0.9,     # min_similarity
    )

    _, cl_masses, _ = PeriodicTable().get_element("Cl").get_abundances()
    mono_mz = 400.0
    mz = [100.0, 300.0, mono_mz - (cl_masses[1] - cl_masses[0]), mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    data = AnnotationData(peak_list)
    mono = data.get_monoisotopologue()

    result = finder.find(data)
    # feature order is unspecified, so compare as sets
    expected = {(mono, 1), (mono, 2), (mono, 3), (peak_list[2], 1)}
    assert set(result) == expected
283 |
--------------------------------------------------------------------------------
/tests/unit/test_batch_corrector.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tidyms import _batch_corrector
3 | # import pytest
4 | from statsmodels.nonparametric.smoothers_lowess import lowess
5 |
6 |
def test_correct_batches(data_container_with_order):
    """Smoke test: batch correction runs with default settings.

    Completing without an exception is the pass condition.
    """
    container = data_container_with_order
    _batch_corrector.correct_batches(
        container.data_matrix,
        container.sample_metadata,
        ["healthy", "disease"],  # study sample classes
        ["QC"],                  # QC class
        verbose=False,
    )
    assert True
20 |
21 |
def test_correct_batches_frac(data_container_with_order):
    """Batch correction accepts an explicit lowess ``frac`` value."""
    container = data_container_with_order
    _batch_corrector.correct_batches(
        container.data_matrix,
        container.sample_metadata,
        ["healthy", "disease"],  # study sample classes
        ["QC"],                  # QC class
        frac=0.7,
        verbose=False,
    )
    assert True
37 |
38 |
def test_correct_batches_first_n(data_container_with_order):
    """Batch correction accepts an explicit ``first_n`` value.

    (The previous comment said "frac value" — a copy/paste left-over.)
    Completing without an exception is the pass condition.
    """
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        first_n=1,
        verbose=False,
    )
54 |
55 |
def test_lowess_min_n_samples():
    """For fewer than four samples lowess returns the input unchanged.

    Checks n = 2 and n = 3. A seeded generator makes the test deterministic
    (the original drew from the global unseeded RNG).
    """
    rng = np.random.default_rng(1234)
    for k in range(2, 4):
        x = np.arange(k)
        y = rng.normal(size=k)
        y_fit = lowess(y, x, is_sorted=True, return_sorted=False)
        assert np.allclose(y, y_fit)
64 |
65 |
def test_split_data_matrix(data_container_with_order):
    """The fragments produced by _split_data_matrix exactly cover the matrix."""
    container = data_container_with_order
    data_matrix = container.data_matrix
    fragments = _batch_corrector._split_data_matrix(
        data_matrix,
        container.sample_metadata,
        ["healthy", "disease"],  # study sample classes
        ["QC"],                  # QC class
        0.0,
    )
    rebuilt = np.zeros(shape=data_matrix.shape, dtype=float)
    for start, col, _order, xgk, _, _ in fragments:
        rebuilt[start + np.arange(xgk.size), col] = xgk.flatten()
    assert np.array_equal(data_matrix.to_numpy(), rebuilt)
83 |
84 |
def test_rebuild_data_matrix(data_container_with_order):
    """_rebuild_data_matrix inverts _split_data_matrix."""
    container = data_container_with_order
    data_matrix = container.data_matrix
    fragments = _batch_corrector._split_data_matrix(
        data_matrix,
        container.sample_metadata,
        ["healthy", "disease"],  # study sample classes
        ["QC"],                  # QC class
        0.0,
    )

    def to_chunk(fragment):
        # compute the row index of each fragment; values are left untouched
        start_index, column, _order, x, _train, _predict = fragment
        return x, np.arange(x.size) + start_index, column

    chunks = [to_chunk(f) for f in fragments]
    rebuilt = _batch_corrector._rebuild_data_matrix(data_matrix.shape, chunks)
    assert np.array_equal(data_matrix.to_numpy(), rebuilt)
110 |
111 |
def test_find_invalid_samples(data_container_with_order):
    """A well-formed run contains no invalid samples."""
    data = data_container_with_order
    invalid = _batch_corrector.find_invalid_samples(
        data.sample_metadata, data.mapping["sample"], data.mapping["qc"]
    )
    assert invalid.size == 0
123 |
def test_find_invalid_samples_remove_first_block(data_container_with_order):
    """A study sample placed before the first QC sample is flagged as invalid."""
    data = data_container_with_order
    metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # turn the first sample of the run into a study sample
    metadata.at[metadata.index[0], "class"] = sample_class[0]
    invalid = _batch_corrector.find_invalid_samples(
        metadata, sample_class, qc_class
    )
    assert invalid.size == 1
138 |
139 |
def test_find_invalid_samples_remove_last_block(data_container_with_order):
    """A study sample placed after the last QC sample is flagged as invalid.

    (The previous comments were copy/pasted from the first-block test and
    wrongly described a modification "at the beginning".)
    """
    data = data_container_with_order
    sample_metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # modify one value at the end of the run
    sample_metadata.at[sample_metadata.index[-1], "class"] = sample_class[0]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 1
154 |
155 |
def test_find_invalid_samples_invalid_batch(
        data_container_with_order_single_qc):
    """All samples of a batch with too few QC samples are flagged as invalid.

    (The previous leading comment was copy/pasted from an unrelated test.)
    """
    data = data_container_with_order_single_qc
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # the third batch does not have enough QC samples and must be removed,
    # so every sample in it counts as invalid
    n_invalid = sample_metadata["batch"].value_counts()[3]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == n_invalid
171 |
172 |
def test_find_invalid_features(data_container_with_order):
    """With a zero threshold every feature in the container is valid."""
    data = data_container_with_order
    invalid = _batch_corrector.find_invalid_features(
        data.data_matrix,
        data.sample_metadata,
        data.mapping["sample"],
        data.mapping["qc"],
        0.0,  # threshold
        1.0,  # min_detection_rate
    )
    assert invalid.size == 0
190 |
191 |
def test_find_invalid_features_threshold(data_container_with_order):
    """With an absurdly high threshold every feature becomes invalid."""
    data = data_container_with_order
    data_matrix = data.data_matrix
    invalid = _batch_corrector.find_invalid_features(
        data_matrix,
        data.sample_metadata,
        data.mapping["sample"],
        data.mapping["qc"],
        10000000.0,  # threshold far above any intensity
        1.0,         # min_detection_rate
    )
    assert invalid.size == data_matrix.shape[1]
--------------------------------------------------------------------------------
/tests/unit/test_chem/test_atoms.py:
--------------------------------------------------------------------------------
1 | from tidyms.chem import atoms
2 | import pytest
3 |
4 |
def test_PeriodicTable_get_element_from_symbol():
    """Elements retrieved by symbol expose the matching atomic number."""
    carbon = atoms.PeriodicTable().get_element("C")
    assert (carbon.z, carbon.symbol) == (6, "C")
10 |
11 |
def test_PeriodicTable_get_element_from_z():
    """Elements retrieved by atomic number expose the matching symbol."""
    phosphorus = atoms.PeriodicTable().get_element(15)
    assert (phosphorus.symbol, phosphorus.z) == ("P", 15)
17 |
18 |
def test_PeriodicTable_get_isotope_from_symbol():
    """Isotopes retrieved by string expose mass number and element symbol."""
    chlorine37 = atoms.PeriodicTable().get_isotope("37Cl")
    assert chlorine37.a == 37
    assert chlorine37.get_symbol() == "Cl"
24 |
25 |
def test_PeriodicTable_get_isotope_copy():
    """copy=True returns an equal but distinct Isotope instance."""
    table = atoms.PeriodicTable()
    duplicate = table.get_isotope("37Cl", copy=True)
    original = table.get_isotope("37Cl")
    assert original is not duplicate
    for attr in ("a", "m", "z"):
        assert getattr(original, attr) == getattr(duplicate, attr)
35 |
36 |
@pytest.mark.parametrize(
    "z,a,m,abundance,expected_symbol",
    [
        (6, 12, 12.0, 0.9, "C"),     # carbon; dummy abundance / exact mass
        (1, 1, 1.0078, 0.9, "H"),    # hydrogen
        (15, 31, 30.099, 1.0, "P"),  # phosphorus
    ],
)
def test_Isotope_get_symbol(z, a, m, abundance, expected_symbol):
    """An Isotope built from raw numbers reports the expected symbol."""
    assert atoms.Isotope(z, a, m, abundance).get_symbol() == expected_symbol
48 |
49 |
def test_Element_get_monoisotope():
    """get_monoisotope on boron returns the isotope with mass number 11."""
    boron = atoms.PeriodicTable().get_element("B")
    assert boron.get_monoisotope().a == 11
54 |
55 |
def test_Element_get_mmi():
    """get_mmi on boron returns the isotope with mass number 10."""
    boron = atoms.PeriodicTable().get_element("B")
    assert boron.get_mmi().a == 10
60 |
--------------------------------------------------------------------------------
/tests/unit/test_chem/test_formula.py:
--------------------------------------------------------------------------------
1 | from tidyms.chem import formula
2 | from tidyms.chem.atoms import InvalidIsotope, PeriodicTable
3 | import pytest
4 |
5 |
@pytest.mark.parametrize(
    "formula_str,p_open,p_close",
    [
        ("[Cr[H2O]6]3+", 0, 9),
        ("[C9H11NO2]", 0, 9),
        ("C9H11N(17O)2", 6, 10),
        ("[Cr[(2H)2O]6]3+", 3, 10),
    ],
)
def test_find_matching_parenthesis_valid_input(formula_str, p_open, p_close):
    """The matching closing bracket is found at the expected index."""
    assert formula._find_matching_parenthesis(formula_str, p_open) == p_close
18 |
19 |
@pytest.mark.parametrize(
    "formula_str,formula_without_charge,charge",
    [
        ("H2O", "H2O", 0),
        ("(13C)", "(13C)", 0),
        ("[CO3]2-", "CO3", -2),
        ("[Cr[H2O]6]3+", "Cr[H2O]6", 3),
        ("[C9H11NO2]", "[C9H11NO2]", 0),
        ("CO-", "CO", -1),
        ("[H2O]+", "[H2O]", 1),
        ("H2O+", "H2O", 1),
    ],
)
def test_parse_charge_valid_input(formula_str, formula_without_charge, charge):
    """The charge suffix is split from the rest of the formula string."""
    parsed_str, parsed_charge = formula._parse_charge(formula_str)
    assert (parsed_str, parsed_charge) == (formula_without_charge, charge)
37 |
38 |
@pytest.mark.parametrize("formula_str", ["SO42-"])
def test_parse_charge_invalid_input(formula_str):
    """A charge suffix without enclosing brackets is rejected."""
    with pytest.raises(formula.InvalidFormula):
        formula._parse_charge(formula_str)
43 |
44 |
@pytest.mark.parametrize(
    "formula_str,ind,token_type",
    [
        ("H2O", 0, 0),
        ("H2(34S)O4", 2, 1),
        ("[Cr(H2O)6]3+", 3, 2),
        ("[Fe[CN]6]4-", 3, 2),
    ],
)
def test_get_token_type(formula_str, ind, token_type):
    """The token type at each position is classified correctly."""
    assert formula._get_token_type(formula_str, ind) == token_type
57 |
58 |
@pytest.mark.parametrize(
    "formula_str,ind,coeff,new_ind",
    [
        ("H2O", 3, 1, 3),
        ("CO2", 1, 1, 1),
        ("C9H11NO2", 3, 11, 5),
    ],
)
def test_get_coefficient_valid_input(formula_str, ind, coeff, new_ind):
    """Both the coefficient value and the advanced index are correct."""
    assert formula._get_coefficient(formula_str, ind) == (coeff, new_ind)
71 |
72 |
@pytest.mark.parametrize(
    "formula_str,ind,new_ind,element",
    [
        ("H2O", 0, 2, "H"),
        ("H2O", 2, 3, "O"),
        ("C9H11NO2", 5, 6, "N"),
        ("C9H11N(17O)2", 5, 6, "N"),
        ("Cr(H2O)6", 0, 2, "Cr"),
    ],
)
def test_tokenize_element_valid_input(formula_str, ind, new_ind, element):
    """The element token holds its monoisotope and the index advances."""
    token, parsed_ind = formula._tokenize_element(formula_str, ind)
    assert parsed_ind == new_ind
    assert PeriodicTable().get_element(element).get_monoisotope() in token
88 |
89 |
@pytest.mark.parametrize(
    "formula_str,ind,isotope_str,new_ind",
    [
        ("(13C)O2", 0, "13C", 5),
        ("C9H11(15N)2O2", 5, "15N", 11),
        ("C6H12O5(18O)", 7, "18O", 12),
        ("C6H12O4(18O)2", 7, "18O", 13),
    ],
)
def test_tokenize_isotope_valid_input(formula_str, ind, isotope_str, new_ind):
    """The isotope token holds the expected isotope and the index advances."""
    token, parsed_ind = formula._tokenize_isotope(formula_str, ind)
    assert parsed_ind == new_ind
    assert PeriodicTable().get_isotope(isotope_str) in token
104 |
105 |
@pytest.mark.parametrize(
    "f_str,composition",
    [
        ("H2O", {"1H": 2, "16O": 1}),
        ("(13C)O2", {"13C": 1, "16O": 2}),
        ("C9H11(15N)2O2", {"12C": 9, "1H": 11, "15N": 2, "16O": 2}),
        ("C9H11N2O2", {"12C": 9, "1H": 11, "14N": 2, "16O": 2}),
        ("Cr[(2H)2O]6", {"52Cr": 1, "2H": 12, "16O": 6}),
    ],
)
def test_tokenize_formula(f_str, composition):
    """Parsed composition matches the expected per-isotope coefficients."""
    table = PeriodicTable()
    expected = {table.get_isotope(k): v for k, v in composition.items()}
    parsed = formula._parse_formula(f_str)
    for isotope, coefficient in expected.items():
        assert parsed[isotope] == coefficient
121 |
122 |
def test_arg_sort_elements():
    """Indices sort by element symbol first and then by mass number."""
    symbols = ["Cd", "C", "H", "H", "O", "O", "S", "B"]
    mass_numbers = [60, 12, 2, 1, 16, 17, 32, 7]
    expected_order = [7, 1, 0, 3, 2, 4, 5, 6]
    assert formula._arg_sort_elements(symbols, mass_numbers) == expected_order
128 |
129 |
@pytest.mark.parametrize(
    "charge,charge_str",
    [
        (1, "+"),
        (2, "2+"),
        (-1, "-"),
        (-4, "4-"),
    ],
)
def test_get_charge_str(charge, charge_str):
    """Integer charges render with magnitude-then-sign formatting."""
    assert formula._get_charge_str(charge) == charge_str
142 |
143 |
@pytest.mark.parametrize(
    "f,f_str",
    [
        (formula.Formula("CO2"), "CO2"),
        (formula.Formula("(13C)C2H6O3"), "C2(13C)H6O3"),
        (formula.Formula("C24H46SPN(18O)2"), "C24H46N(18O)2PS"),
        (formula.Formula("[Cr(H2O)6]3+"), "[H12CrO6]3+"),
        (formula.Formula("CH3CH2CH3"), "C3H8"),
        (formula.Formula("F2"), "F2"),
    ],
)
def test_get_formula_str(f, f_str):
    """str() renders formulas in canonical Hill-like ordering."""
    assert str(f) == f_str
158 |
159 |
@pytest.mark.parametrize("f_str", ["(CO2", "#H2O"])
def test_parse_formula_invalid_formula(f_str):
    """Malformed formula strings raise InvalidFormula."""
    with pytest.raises(formula.InvalidFormula):
        formula.Formula(f_str)
164 |
165 |
@pytest.mark.parametrize("f_str", ["(14C)O2", "(3H)2O"])
def test_parse_formula_invalid_isotope(f_str):
    """Isotopes absent from the periodic table data raise InvalidIsotope."""
    with pytest.raises(InvalidIsotope):
        formula.Formula(f_str)
170 |
171 |
@pytest.fixture
def formula_data():
    """Formula strings paired with their nominal and exact masses."""
    formula_str = ["CO2", "H2O", "F2"]
    nominal_masses = [44, 18, 38]
    exact_masses = [43.9898, 18.0106, 37.9968]
    return formula_str, nominal_masses, exact_masses
178 |
179 |
def test_get_exact_mass(formula_data):
    """Exact masses agree with reference values to within 1e-4."""
    formula_str, _, exact = formula_data
    for f_str, expected in zip(formula_str, exact):
        assert abs(formula.Formula(f_str).get_exact_mass() - expected) < 0.0001
184 |
185 |
def test_get_nominal_mass(formula_data):
    """Nominal masses match the reference integers exactly."""
    formula_str, nominal, _ = formula_data
    for f_str, expected in zip(formula_str, nominal):
        assert formula.Formula(f_str).get_nominal_mass() == expected
190 |
191 |
def test_formula_from_dictionary():
    """Formula accepts a {symbol-or-isotope: coefficient} mapping and a charge."""
    composition = {"C": 1, "17O": 2, "H": 2}
    f = formula.Formula(composition, 1)
    table = PeriodicTable()
    for key in composition:
        assert table.get_isotope(key) in f.composition
    assert f.charge == 1
198 | assert charge == f.charge
199 |
200 |
def test_formula_from_dictionary_invalid_isotope():
    """Unknown element symbols in the mapping raise InvalidIsotope."""
    with pytest.raises(InvalidIsotope):
        formula.Formula({"C": 1, "G": 4}, 1)
206 |
207 |
def test_formula_from_dictionary_invalid_isotope_type():
    """Non-string keys in the composition mapping raise ValueError."""
    with pytest.raises(ValueError):
        formula.Formula({4: 1, "G": 4}, 1)
213 |
214 |
@pytest.mark.parametrize(
    "composition,q",
    [
        ({"C": -1, "H": 4}, 1),    # negative coefficient
        ({"C": 1, "H": 4}, 0.5),   # non-integer charge
    ],
)
def test_formula_from_dictionary_invalid_coefficient(composition, q):
    """Negative coefficients and fractional charges are rejected."""
    with pytest.raises(ValueError):
        formula.Formula(composition, q)
225 |
226 |
def test_Formula_add():
    """Adding two formulas merges their compositions."""
    water = formula.Formula("H2O")
    carbon_dioxide = formula.Formula("CO2")
    assert water + carbon_dioxide == formula.Formula("H2CO3")
233 |
234 |
def test_Formula_add_invalid_type():
    """Adding a plain string to a Formula raises ValueError."""
    with pytest.raises(ValueError):
        formula.Formula("H2O") + "CO2"
240 |
241 |
def test_Formula_subtract_valid():
    """Subtracting a formula removes its coefficients from the composition."""
    glucose = formula.Formula("C6H12O6")
    carbon_dioxide = formula.Formula("CO2")
    assert glucose - carbon_dioxide == formula.Formula("C5H12O4")
248 |
249 |
def test_Formula_subtract_invalid_type():
    """Subtracting a plain string from a Formula raises ValueError."""
    with pytest.raises(ValueError):
        formula.Formula("C6H12O6") - "CO2"
255 |
256 |
def test_Formula_subtract_valid_zero_coeff():
    """Elements whose coefficient drops to zero vanish from the result."""
    minuend = formula.Formula("C4H8O2")
    subtrahend = formula.Formula("CO2")
    assert minuend - subtrahend == formula.Formula("C3H8")
263 |
264 |
def test_Formula_subtract_invalid_coeff():
    """Subtraction that would produce a negative coefficient raises ValueError."""
    with pytest.raises(ValueError):
        formula.Formula("C4H8O") - formula.Formula("CO2")
270 |
--------------------------------------------------------------------------------
/tests/unit/test_chem/test_isotope_distributions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from tidyms.chem import _envelope_utils as ids
4 | from tidyms.chem import Formula, PeriodicTable
5 | from itertools import product
6 |
7 |
@pytest.mark.parametrize(
    "isotope_symbol,n,max_length",
    product(["2H", "31P"], [0, 1, 5], [1, 2, 5]))
def test__get_n_isotopes_envelope(isotope_symbol: str, n: int, max_length: int):
    """n copies of one isotope yield a single line at n * m with p = 1."""
    isotope = PeriodicTable().get_isotope(isotope_symbol)
    M, p = ids._get_n_isotopes_envelope(isotope, n, max_length)
    expected_M = np.zeros(max_length)
    expected_M[0] = n * isotope.m
    expected_p = np.zeros(max_length)
    expected_p[0] = 1.0
    assert np.array_equal(M, expected_M)
    assert np.array_equal(p, expected_p)
20 |
21 |
def test__validate_abundance_valid_value():
    """A normalized abundance array of the right length passes validation."""
    symbol = "C"
    nominal, _, _ = PeriodicTable().get_element(symbol).get_abundances()
    ids._validate_abundance(np.array([0.8, 0.2]), nominal, symbol)
28 |
29 |
def test__validate_abundance_negative_values():
    """Negative abundance entries are rejected."""
    symbol = "C"
    nominal, _, _ = PeriodicTable().get_element(symbol).get_abundances()
    with pytest.raises(ValueError):
        ids._validate_abundance(np.array([0.8, -0.01]), nominal, symbol)
37 |
38 |
def test__validate_abundance_non_normalized():
    """Abundances that do not sum to one are rejected."""
    symbol = "C"
    nominal, _, _ = PeriodicTable().get_element(symbol).get_abundances()
    with pytest.raises(ValueError):
        ids._validate_abundance(np.array([0.8, 0.21]), nominal, symbol)
46 |
47 |
def test__validate_abundance_invalid_length():
    """An abundance array whose length differs from the element's is rejected."""
    symbol = "C"
    nominal, _, _ = PeriodicTable().get_element(symbol).get_abundances()
    with pytest.raises(ValueError):
        ids._validate_abundance(np.array([0.8, 0.015, 0.05]), nominal, symbol)
55 |
56 |
@pytest.mark.parametrize(
    "n_isotopes,n",
    [[1, 1], [1, 2], [1, 5], [1, 10], [2, 1], [2, 5], [2, 20], [5, 1], [5, 10]]
)
def test__find_n_isotopes_combination(n_isotopes, n):
    """Combinations are exactly the tuples of length n_isotopes summing to n."""
    comb = ids._find_n_isotope_combination(n_isotopes, n)
    expected = np.array(
        [c for c in product(range(n + 1), repeat=n_isotopes) if sum(c) == n]
    )
    # row order is unspecified, so check mutual containment
    for row in expected:
        assert row in comb
    for row in comb:
        assert row in expected
70 |
71 |
@pytest.mark.parametrize(
    "element,max_length",
    product(["C", "S"], [2, 5, 10]))
def test__get_n_atoms_envelope_aux_n_1(element: str, max_length: int):
    """With a single atom the envelope equals the element's padded abundances."""
    el = PeriodicTable().get_element(element)
    m_el, M_el, p_el = el.get_abundances()
    M, p = ids._get_n_atoms_envelope_aux(m_el, M_el, p_el, 1, max_length)
    M_filled, p_filled = ids._fill_missing_nominal(m_el, M_el, p_el, max_length)
    assert np.allclose(M, M_filled)
    assert np.allclose(p, p_filled / np.sum(p_filled))
82 |
83 |
def test__get_n_atoms_envelope_aux_c_n_3_max_length_3():
    """Envelope of three carbon atoms truncated to three isotopologues."""
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # expected lines: 3x12C, 2x12C + 13C, 12C + 2x13C (the literal 12 == m_c12)
    M_expected = np.array([3 * m_c12, 2 * m_c12 + m_c13, 12 + 2 * m_c13])
    assert np.allclose(M, M_expected)
    # NOTE(review): this asserts on the *input* abundances ``pe`` (which sum
    # to 1.0 by construction) rather than the computed ``p`` — possibly a
    # typo for ``np.sum(p)``; confirm before changing.
    assert np.allclose(np.sum(pe), 1.0)
95 |
96 |
def test__get_n_atoms_envelope_aux_c_n_3_max_length_5():
    """Envelope of three carbon atoms padded to length 5 with a trailing zero."""
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 5
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # the final 0 pads beyond the possible 3-atom combinations (12 == m_c12)
    M_expected = np.array([3 * m_c12, 2 * m_c12 + m_c13, 12 + 2 * m_c13, 3 * m_c13, 0])
    assert np.allclose(M, M_expected)
    # NOTE(review): asserts on the input abundances ``pe`` (always 1.0), not
    # on the computed ``p`` — likely a typo for ``np.sum(p)``; confirm.
    assert np.allclose(np.sum(pe), 1.0)
108 |
109 |
def test__get_n_atoms_envelope_aux_s_n_2_max_length_3():
    """Envelope of two sulfur atoms truncated to nominal masses 64-66."""
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    assert np.array_equal(M.round().astype(int), np.array([64, 65, 66]))
    # NOTE(review): asserts on the input abundances ``pe`` (always 1.0), not
    # on the computed ``p`` — likely a typo for ``np.sum(p)``; confirm.
    assert np.allclose(np.sum(pe), 1.0)
118 |
119 |
def test__get_n_atoms_envelope_aux_s_n_2_max_length_10():
    """Envelope of two sulfur atoms padded to length 10.

    The zero entries presumably mark nominal masses with no contributing
    isotope combination — TODO confirm against _get_n_atoms_envelope_aux.
    """
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 10
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    M_rounded = np.array([64, 65, 66, 67, 68, 69, 70, 0, 72, 0])
    assert np.array_equal(M.round().astype(int), M_rounded)
    # NOTE(review): asserts on the input abundances ``pe`` (always 1.0), not
    # on the computed ``p`` — likely a typo for ``np.sum(p)``; confirm.
    assert np.allclose(np.sum(pe), 1.0)
129 |
130 |
def test__get_n_atoms_envelope():
    """A single carbon atom reproduces the element's own abundance data."""
    element = PeriodicTable().get_element("C")
    _, expected_M, expected_p = element.get_abundances()
    M, p = ids._get_n_atoms_envelope(element.isotopes[12], 1, 2)
    assert np.allclose(M, expected_M)
    assert np.allclose(p, expected_p)
138 |
139 |
def test__get_n_atoms_envelope_custom_abundance():
    """User-supplied abundances override the element's default values."""
    element = PeriodicTable().get_element("C")
    _, expected_M, _ = element.get_abundances()
    custom_p = np.array([0.8, 0.2])
    M, p = ids._get_n_atoms_envelope(element.isotopes[12], 1, 2, p=custom_p)
    assert np.allclose(M, expected_M)
    assert np.allclose(p, custom_p)
148 |
149 |
def test__fill_missing_nominal_no_fill():
    """Arrays already padded to max_length are returned unchanged.

    Carbon-like data does not need to fill missing nominal values.
    (The previous comment had a typo: "feel" for "fill".)
    """
    max_length = 5
    m = np.array([24, 25, 26, 0, 0])
    M = np.array([24.1, 24.2, 24.3, 0, 0])
    p = np.array([0.5, 0.3, 0.2, 0, 0])
    M_fill, p_fill = ids._fill_missing_nominal(m, M, p, max_length)
    assert np.allclose(M_fill, M)
    assert np.allclose(p_fill, p)
159 |
160 |
def test__fill_missing_nominal_fill():
    """Gaps in nominal mass (e.g. Cl has no M + 1 isotope) are zero-filled."""
    max_length = 5
    nominal = np.array([105, 107, 109])
    exact = np.array([105.1, 107.2, 109.3])
    abundance = np.array([0.5, 0.3, 0.2])
    M_fill, p_fill = ids._fill_missing_nominal(nominal, exact, abundance, max_length)
    assert np.allclose(M_fill, np.array([exact[0], 0, exact[1], 0, exact[2]]))
    assert np.allclose(p_fill, np.array([abundance[0], 0, abundance[1], 0, abundance[2]]))
172 |
173 |
def test__combine_envelopes_one_row_array():
    """Combining 2-atom and 5-atom carbon envelopes equals the 7-atom envelope.

    Single-row 2-D arrays exercise the row-wise path. The original reshaped
    M2/p2 with M1.size/p1.size, which only worked because the lengths
    coincide; each array is now reshaped by its own size.
    """
    c12 = PeriodicTable().get_isotope("12C")
    max_length = 10
    n1 = 2
    n2 = 5
    n = n1 + n2
    M1, p1 = ids._get_n_atoms_envelope(c12, n1, max_length)
    M1 = M1.reshape((1, M1.size))
    p1 = p1.reshape((1, p1.size))
    M2, p2 = ids._get_n_atoms_envelope(c12, n2, max_length)
    M2 = M2.reshape((1, M2.size))
    p2 = p2.reshape((1, p2.size))
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = ids._get_n_atoms_envelope(c12, n, max_length)
    M_expected = M_expected.reshape((1, M_expected.size))
    p_expected = p_expected.reshape((1, p_expected.size))
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)
192 |
193 |
def test__combine_envelopes_multiple_row_array():
    """combine_envelopes works row-wise on stacked (tiled) envelopes."""
    c12 = PeriodicTable().get_isotope("12C")
    n_rep = 5
    max_length = 10
    n1, n2 = 2, 5

    def tiled_envelope(n_atoms):
        # single envelope repeated on n_rep rows
        M, p = ids._get_n_atoms_envelope(c12, n_atoms, max_length)
        return np.tile(M, (n_rep, 1)), np.tile(p, (n_rep, 1))

    M1, p1 = tiled_envelope(n1)
    M2, p2 = tiled_envelope(n2)
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = tiled_envelope(n1 + n2)
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)
213 |
214 |
def test_find_formula_abundances():
    """Smoke test: computing the CO2 envelope completes without raising."""
    composition = Formula("CO2").composition
    ids.find_formula_envelope(composition, 10)
219 |
--------------------------------------------------------------------------------
/tests/unit/test_chem/test_isotope_scorer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from tidyms.chem import EnvelopeScorer, EnvelopeValidator
4 | from tidyms.chem import Formula, get_chnops_bounds
5 |
6 |
# molecular formulas shared by the scorer and validator tests below
formula_str_list = ["C11H12N2O2", "C6H12O6", "C27H46O", "CO2", "HCOOH"]
8 |
9 |
@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_find_valid_bounds(f_str):
    """Each envelope entry falls inside the validator's computed bounds."""
    max_length = 5
    validator = EnvelopeValidator(get_chnops_bounds(500), max_length=max_length)
    M, p = Formula(f_str).get_isotopic_envelope(max_length)
    validator.generate_envelopes(M, p, 0.005)
    # the validator works with a subset of elements, so results are not
    # strictly equal; small tolerances absorb the difference in M and p
    M_tol = 0.0000001
    p_tol = 0.0001
    for k in range(M.size):
        min_M, max_M, min_p, max_p = validator._find_bounds(k)
        assert min_M - M_tol < M[k] < max_M + M_tol
        assert min_p - p_tol < p[k] < max_p + p_tol
28 |
29 |
@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_validate(f_str):
    """A genuine CHNOPS envelope validates over its full length."""
    max_length = 5
    validator = EnvelopeValidator(get_chnops_bounds(500), max_length=max_length)
    M, p = Formula(f_str).get_isotopic_envelope(max_length)
    assert validator.validate(M, p) == max_length
39 |
40 |
def test_EnvelopeValidator_validate_invalid_envelope():
    """A boron-containing envelope fails validation under CHNOPS bounds."""
    max_length = 5
    validator = EnvelopeValidator(get_chnops_bounds(500), max_length=max_length)
    M, p = Formula("C2H8B").get_isotopic_envelope(max_length)
    assert validator.validate(M, p) == 0
50 |
51 |
@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer(f_str):
    """The top-scoring candidate matches the generating molecular formula."""
    f = Formula(f_str)
    max_length = 5
    scorer = EnvelopeScorer(get_chnops_bounds(500), max_length=max_length)
    M, p = f.get_isotopic_envelope(max_length)
    scorer.score(M, p, 0.005)
    coeff, isotopes, _score = scorer.get_top_results(5)
    expected = [f.composition[isotope] for isotope in isotopes]
    assert np.array_equal(expected, coeff[0])
65 |
66 |
@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_length_gt_scorer_max_length(f_str):
    """Scoring an envelope longer than the scorer's max_length raises."""
    f = Formula(f_str)
    max_length = 3
    bounds = get_chnops_bounds(500)
    chnops_scorer = EnvelopeScorer(bounds, max_length=max_length)
    # envelope is one entry longer than the scorer supports
    M, p = f.get_isotopic_envelope(max_length + 1)
    tolerance = 0.005

    with pytest.raises(ValueError):
        chnops_scorer.score(M, p, tolerance)
        # NOTE(review): the three lines below are unreachable because
        # score() raises first — they look like copy/paste residue from
        # test_EnvelopeScorer; confirm and consider removing.
        coeff, isotopes, score = chnops_scorer.get_top_results(5)
        expected_coeff = [f.composition[x] for x in isotopes]
        assert np.array_equal(expected_coeff, coeff[0])
82 |
83 |
@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_custom_scorer(f_str):
    """A user-provided scorer callable ranks the true formula first."""

    def cosine_scorer(mz1, ab1, mz2, ab2, **scorer_params):
        # ad-hoc similarity: 1 - ||ab1 - ab2|| / (||ab1|| * ||ab2||)
        n1 = np.linalg.norm(ab1)
        n2 = np.linalg.norm(ab2)
        return 1 - np.linalg.norm(ab1 - ab2) / (n1 * n2)

    f = Formula(f_str)
    max_length = 5
    M, p = f.get_isotopic_envelope(max_length)
    scorer = EnvelopeScorer(
        get_chnops_bounds(500), scorer=cosine_scorer, max_length=max_length
    )
    scorer.score(M, p, 0.005)
    coeff, isotopes, _score = scorer.get_top_results(5)
    expected = [f.composition[isotope] for isotope in isotopes]
    assert np.array_equal(expected, coeff[0])
104 |
105 |
@pytest.fixture
def positive_elements_scorer():
    """Scorer restricted to positive mass-defect elements (C, H, N)."""
    element_bounds = {"C": (0, 10), "H": (0, 10), "N": (0, 10)}
    return EnvelopeScorer(element_bounds, max_length=5)
110 |
111 |
@pytest.mark.parametrize("f_str", ["C2H3N", "N2H4", "C3N3H3"])
def test_EnvelopeScorer_positive_defect_elements_only(f_str, positive_elements_scorer):
    """The top candidate matches the formula when only C/H/N are allowed."""
    formula = Formula(f_str)
    length = positive_elements_scorer.max_length
    M, p = formula.get_isotopic_envelope(length)
    positive_elements_scorer.score(M, p, 0.005)
    coefficients, isotopes, _ = positive_elements_scorer.get_top_results(5)
    expected = [formula.composition[i] for i in isotopes]
    assert np.array_equal(expected, coefficients[0])
122 |
123 |
@pytest.fixture
def negative_elements_scorer():
    """Scorer restricted to negative mass-defect elements (C, O, S)."""
    element_bounds = {"C": (0, 10), "O": (0, 10), "S": (0, 10)}
    return EnvelopeScorer(element_bounds, max_length=5)
128 |
129 |
@pytest.mark.parametrize("f_str", ["CS2", "C2OS2", "C3SO"])
def test_EnvelopeScorer_negative_defect_elements_only(f_str, negative_elements_scorer):
    """The top candidate matches the formula when only C/O/S are allowed."""
    formula = Formula(f_str)
    length = negative_elements_scorer.max_length
    M, p = formula.get_isotopic_envelope(length)
    negative_elements_scorer.score(M, p, 0.001)
    coefficients, isotopes, _ = negative_elements_scorer.get_top_results(5)
    expected = [formula.composition[i] for i in isotopes]
    assert np.array_equal(expected, coefficients[0])
140 |
141 |
@pytest.fixture
def no_carbon_scorer():
    """Scorer for carbon-free formulas (H, O, S, P only)."""
    element_bounds = {"H": (0, 10), "O": (0, 5), "S": (0, 5), "P": (0, 5)}
    return EnvelopeScorer(element_bounds, max_length=5)
146 |
147 |
@pytest.mark.parametrize("f_str", ["H2O", "H3PO4", "H2SO4"])
def test_EnvelopeScorer_no_carbon(f_str, no_carbon_scorer):
    """The top candidate matches the formula for carbon-free compounds."""
    formula = Formula(f_str)
    length = no_carbon_scorer.max_length
    M, p = formula.get_isotopic_envelope(length)
    no_carbon_scorer.score(M, p, 0.005)
    coefficients, isotopes, _ = no_carbon_scorer.get_top_results(5)
    expected = [formula.composition[i] for i in isotopes]
    assert np.array_equal(expected, coefficients[0])
158 |
--------------------------------------------------------------------------------
/tests/unit/test_consensus_annotation.py:
--------------------------------------------------------------------------------
1 | from tidyms import consensus_annotation
2 | from tidyms import _constants as c
3 | import pandas as pd
4 | import pytest
5 | from collections import Counter
6 |
@pytest.fixture
def feature_table():
    """Feature table with three feature labels, all in the same envelope.

    Rows where the label is -1 are noise features.
    """
    cols = [c.SAMPLE, c.LABEL, c.ENVELOPE_LABEL, c.ENVELOPE_INDEX, c.CHARGE]
    noise_rows = [[sample, -1, -1, -1, -1] for sample in range(3)]
    feature_rows = [
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [2, 0, 0, 0, 1],
        [3, 0, 0, 1, 1],
        [4, 0, 0, 0, 2],
        [0, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [2, 1, 0, 1, 1],
        [3, 1, 0, 2, 1],
        [4, 1, 0, 1, 2],
        [0, 2, 0, 2, 1],
        [1, 2, 0, 2, 1],
        [2, 2, 0, 2, 1],
        [3, 2, 0, 2, 1],
        [4, 2, 0, 2, 2],
    ]
    return pd.DataFrame(data=noise_rows + feature_rows, columns=cols)
33 |
34 |
def test__build_graph(feature_table):
    """Voting yields charge 1, envelope 0, and index equal to each label."""
    _, annotations = consensus_annotation.vote_annotations(feature_table)
    assert len(annotations) == 3
    for label, annotation in annotations.items():
        assert annotation[c.CHARGE] == 1
        assert annotation[c.ENVELOPE_LABEL] == 0
        assert annotation[c.ENVELOPE_INDEX] == label
42 |
43 |
def test__build_graph_nodes(feature_table):
    """Each feature label becomes a node with its majority charge and index."""
    actual = consensus_annotation._build_graph_nodes(feature_table)
    expected = {
        label: {c.CHARGE: 1, c.ENVELOPE_INDEX: label} for label in range(3)
    }
    assert actual == expected
52 |
def test__build_graph_edges(feature_table):
    """Edges connect label 0 to labels 1 and 2, four times each."""
    edges = consensus_annotation._build_graph_edges(feature_table)
    assert Counter(edges) == Counter({(0, 1): 4, (0, 2): 4})
58 |
--------------------------------------------------------------------------------
/tests/unit/test_correspondence.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tidyms import correspondence
4 | from tidyms import _constants as c
5 | import pytest
6 | from sklearn.cluster import DBSCAN
7 |
8 |
9 | # test make_initial_cluster
10 |
@pytest.mark.parametrize(
    "n,k,max_size",
    [[20, 2, 10], [100, 4, 125], [200, 25, 1500], [200, 10, 20000]]
)
def test_make_initial_cluster(n, k, max_size):
    """Clustering with data splitting must match plain DBSCAN.

    ``n`` is the number of samples and ``k`` the number of clusters; several
    sizes are tried against an un-split DBSCAN reference.
    """
    base = np.arange(n)
    X = np.vstack((base, base)).T
    X = np.random.permutation(np.repeat(X, k, axis=0))
    # k clusters, no noise should be present
    eps = 0.1
    min_samples = round(n * 0.2)
    actual = correspondence._cluster_dbscan(X, eps, min_samples, max_size)
    reference = DBSCAN(eps=eps, min_samples=min_samples, metric="chebyshev")
    reference.fit(X)
    assert np.array_equal(actual, reference.labels_)
35 |
36 |
37 | # test estimate n species
38 |
@pytest.mark.parametrize(
    "min_samples,expected",
    [[1, np.array([2, 2])], [2, np.array([2, 2])], [3, np.array([0, 0])]])
def test_estimate_n_species_one_class(min_samples, expected):
    """Two species per cluster are found unless min_samples is too high."""
    # 8 features in total, detected across two samples
    samples = np.repeat([0, 1], 4)
    # two clusters, interleaved within each sample
    clusters = np.tile(np.repeat([0, 1], 2), 2)
    n_clusters = 2
    actual = correspondence._estimate_n_species_one_class(
        samples, clusters, min_samples, n_clusters
    )
    assert np.array_equal(actual, expected)
55 |
56 |
def test_estimate_n_species_multiple_groups():
    """Two species in two clusters are reported for each of two classes."""
    samples = np.repeat([0, 1, 2], 4)            # 12 features in three samples
    clusters = np.tile(np.repeat([0, 1], 2), 3)  # two clusters
    classes = np.repeat([0, 1], [8, 4])          # two groups
    min_dr = 0.5
    include_classes = [0, 1]
    samples_per_class = {0: 2, 1: 1}

    actual = correspondence._estimate_n_species(
        samples, clusters, classes, samples_per_class, include_classes, min_dr)
    assert actual == {0: 2, 1: 2}
76 |
77 |
78 | # test _get_min_samples
79 |
@pytest.fixture
def samples_per_class():
    """Mapping from class label to the number of samples in that class."""
    return {0: 8, 1: 16, 2: 24}
88 |
89 |
def test_get_min_samples_include_classes_none(samples_per_class):
    """Without a class filter, min samples is a fraction of the total count."""
    fraction = 0.25
    actual = correspondence._get_min_sample(samples_per_class, None, fraction)
    expected = round(sum(samples_per_class.values()) * fraction)
    assert expected == actual
97 |
98 |
def test_get_min_samples_include_classes(samples_per_class):
    """With a class filter, min samples derives from the smallest included class."""
    fraction = 0.25
    include = [0, 1]
    actual = correspondence._get_min_sample(samples_per_class, include, fraction)
    smallest = min(v for k, v in samples_per_class.items() if k in include)
    assert round(smallest * fraction) == actual
107 |
108 |
def test_process_cluster_one_species():
    """One species plus noise: a single label, with -1 for noise features."""
    np.random.seed(1234)
    n = 200
    features = np.random.normal(size=(n, 2))
    samples = np.arange(n)

    # append noise features far from the cluster center
    n_noise = 10
    noise_features = np.random.normal(size=(n_noise, 2), loc=4)
    noise_samples = np.random.choice(samples, size=n_noise)
    features = np.vstack((features, noise_features))
    samples = np.hstack((samples, noise_samples))

    expected = np.array([0] * n + [-1] * n_noise)

    n_species = 1
    max_deviation = 4
    labels, _ = correspondence._process_cluster(
        features, samples, n_species, max_deviation)
    assert np.array_equal(labels, expected)
130 |
131 |
def test_process_cluster_two_species():
    """Two species plus noise: labels 0 and 1, with -1 for noise features."""
    np.random.seed(1234)
    n = 200
    feature_parts = []
    sample_parts = []
    for center in [0, 4]:
        feature_parts.append(np.random.normal(size=(n, 2), loc=center))
        sample_parts.append(np.arange(n))

    # noise features far from both clusters
    n_noise = 10
    feature_parts.append(np.random.normal(size=(n_noise, 2), loc=8))
    X = np.vstack(feature_parts)
    sample_parts.append(np.random.choice(sample_parts[0], size=n_noise))
    samples = np.hstack(sample_parts)

    expected = np.array([0] * n + [1] * n + [-1] * n_noise)

    n_species = 2
    max_deviation = 4
    labels, _ = correspondence._process_cluster(
        X, samples, n_species, max_deviation)
    assert np.array_equal(labels, expected)
156 |
157 |
def test_match_features():
    """End-to-end matching recovers two species and flags noise as -1."""
    np.random.seed(1234)
    n = 200
    feature_parts = []
    sample_parts = []
    for center in [0, 4]:
        feature_parts.append(np.random.normal(size=(n, 2), loc=center))
        sample_parts.append(np.arange(n))

    # noise features far from both clusters
    n_noise = 10
    feature_parts.append(np.random.normal(size=(n_noise, 2), loc=8))
    X = np.vstack(feature_parts)
    sample_parts.append(np.random.choice(sample_parts[0], size=n_noise))
    samples = np.hstack(sample_parts)

    feature_table = pd.DataFrame(X, columns=["mz", "rt"])
    feature_table[c.SAMPLE] = samples
    feature_table[c.CLASS] = 0
    samples_per_class = {0: 200}

    expected = np.array([0] * n + [1] * n + [-1] * n_noise)

    result = correspondence.match_features(
        feature_table, samples_per_class, None, 2, 2, 0.25, 4, verbose=True)
    assert np.array_equal(result[c.LABEL], expected)
187 |
--------------------------------------------------------------------------------
/tests/unit/test_fileio.py:
--------------------------------------------------------------------------------
1 | from tidyms import fileio
2 | from tidyms.utils import get_tidyms_path
3 | import os
4 | import pytest
5 |
6 |
def test_read_mzmine():
    """An mzmine feature table is readable, downloading the dataset if absent."""
    name = "test-mzmine"
    base = os.path.join(get_tidyms_path(), name)
    matrix_file = os.path.join(base, "data.csv")
    metadata_file = os.path.join(base, "sample.csv")
    try:
        fileio.read_mzmine(matrix_file, metadata_file)
    except FileNotFoundError:
        # dataset not cached yet: fetch it and retry
        fileio.download_dataset(name)
        fileio.read_mzmine(matrix_file, metadata_file)
    assert True
19 |
20 |
def test_read_progenesis():
    """Progenesis data (a single file) is readable, downloading if absent."""
    name = "test-progenesis"
    matrix_file = os.path.join(get_tidyms_path(), name, "data.csv")
    try:
        fileio.read_progenesis(matrix_file)
    except FileNotFoundError:
        # dataset not cached yet: fetch it and retry
        fileio.download_dataset(name)
        fileio.read_progenesis(matrix_file)
    assert True
33 |
34 |
def test_read_xcms():
    """XCMS data (matrix + feature + sample files) is readable, downloading if absent."""
    name = "test-xcms"
    base = os.path.join(get_tidyms_path(), name)
    matrix_file = os.path.join(base, "data.csv")
    sample_file = os.path.join(base, "sample.csv")
    feature_file = os.path.join(base, "feature.csv")
    try:
        fileio.read_xcms(matrix_file, feature_file, sample_file)
    except FileNotFoundError:
        # dataset not cached yet: fetch it and retry
        fileio.download_dataset(name)
        fileio.read_xcms(matrix_file, feature_file, sample_file)
    assert True
50 |
51 |
def test_read_compressed_indexed_mzml(centroid_mzml):
    """Every spectrum and chromatogram of a compressed indexed file is readable."""
    for k in range(centroid_mzml.get_n_spectra()):
        centroid_mzml.get_spectrum(k)
    for k in range(centroid_mzml.get_n_chromatograms()):
        centroid_mzml.get_chromatogram(k)
    assert True
65 |
66 |
def test_read_uncompressed_indexed_mzml():
    """Every spectrum and chromatogram of an uncompressed indexed mzML is readable."""
    cache_path = get_tidyms_path()
    filename = "centroid-data-indexed-uncompressed.mzML"
    data_path = os.path.join(cache_path, "test-raw-data", filename)
    ms_data = fileio.MSData.create_MSData_instance(data_path)
    n_spectra = ms_data.get_n_spectra()
    n_chromatogram = ms_data.get_n_chromatograms()

    # test spectra
    for k in range(n_spectra):
        ms_data.get_spectrum(k)

    # test chromatogram
    # BUG FIX: the loop previously called get_n_chromatograms() instead of
    # get_chromatogram(k), so individual chromatograms were never read
    # (compare with test_read_compressed_indexed_mzml).
    for k in range(n_chromatogram):
        ms_data.get_chromatogram(k)

    assert True
84 |
85 |
def test_read_compressed_no_index_mzml():
    """Every spectrum and chromatogram of a zlib-compressed, non-indexed mzML is readable."""
    cache_path = get_tidyms_path()
    filename = "centroid-data-zlib-no-index-compressed.mzML"
    data_path = os.path.join(cache_path, "test-raw-data", filename)
    ms_data = fileio.MSData.create_MSData_instance(data_path)
    n_spectra = ms_data.get_n_spectra()
    n_chromatogram = ms_data.get_n_chromatograms()

    # test spectra
    for k in range(n_spectra):
        ms_data.get_spectrum(k)

    # test chromatogram
    # BUG FIX: the loop previously called get_n_chromatograms() instead of
    # get_chromatogram(k), so individual chromatograms were never read
    # (compare with test_read_compressed_indexed_mzml).
    for k in range(n_chromatogram):
        ms_data.get_chromatogram(k)

    assert True
103 |
104 |
def test_get_spectra_iterator_start(centroid_mzml):
    """Scan numbers yielded by the iterator never precede ``start``."""
    first_scan = 9
    for scan, _ in centroid_mzml.get_spectra_iterator(start=first_scan):
        assert scan >= first_scan
110 |
111 |
def test_get_spectra_iterator_end(centroid_mzml):
    """Scan numbers yielded by the iterator stay strictly below ``end``."""
    last_scan = 20
    for scan, _ in centroid_mzml.get_spectra_iterator(end=last_scan):
        assert scan < last_scan
117 |
118 |
def test_get_spectra_iterator_ms_level(centroid_mzml):
    """Only spectra with the requested MS level are yielded."""
    level = 2
    for _, spectrum in centroid_mzml.get_spectra_iterator(ms_level=level):
        assert spectrum.ms_level == level
124 |
125 |
def test_get_spectra_iterator_start_time(centroid_mzml):
    """Spectrum acquisition times never precede ``start_time``."""
    earliest = 10
    for _, spectrum in centroid_mzml.get_spectra_iterator(start_time=earliest):
        assert spectrum.time >= earliest
131 |
132 |
def test_get_spectra_iterator_end_time(centroid_mzml):
    """Spectrum acquisition times stay strictly below ``end_time``."""
    latest = 20
    for _, spectrum in centroid_mzml.get_spectra_iterator(end_time=latest):
        assert spectrum.time < latest
138 |
139 |
def test_centroids(profile_mzml):
    """Centroid detection runs on a profile-mode spectrum without errors."""
    spectrum = profile_mzml.get_spectrum(0)
    spectrum.find_centroids()
    assert True
143 |
144 |
def test_load_dataset():
    """Every advertised example dataset can be loaded."""
    for dataset_name in fileio.list_available_datasets():
        fileio.load_dataset(dataset_name)
148 |
149 |
def test_load_dataset_invalid_dataset():
    """Requesting an unknown dataset raises ValueError."""
    with pytest.raises(ValueError):
        fileio.load_dataset("invalid-dataset")
153 |
--------------------------------------------------------------------------------
/tests/unit/test_fill_missing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tidyms as ms
3 |
4 |
def test_get_fill_area_no_peaks_detected(monkeypatch):
    """When feature extraction finds nothing, the fill area is None."""
    time = np.arange(100)
    chromatogram = ms.Chromatogram(time, np.ones_like(time))
    rt, rt_std, n_dev = 50, 10, 1

    def fake_extract_features(self, **kwargs):
        self.features = []

    monkeypatch.setattr(ms.Chromatogram, "extract_features", fake_extract_features)

    assert ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) is None
20 |
21 |
def test_get_fill_area_peak_detected_outside_valid_range(monkeypatch):
    """Peaks outside rt ± n_dev * rt_std are ignored, returning None."""
    time = np.arange(100)
    chromatogram = ms.Chromatogram(time, np.ones_like(time))
    rt, rt_std, n_dev = 50, 10, 1

    def fake_extract_features(self, **kwargs):
        # apex at 75: outside the [40, 60] window around rt=50
        self.features = [ms.lcms.Peak(70, 75, 80, self)]

    monkeypatch.setattr(ms.Chromatogram, "extract_features", fake_extract_features)

    assert ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) is None
37 |
38 |
def test_get_fill_area_peak_detected_inside_valid_range(monkeypatch):
    """A peak inside the valid rt window contributes its area."""
    time = np.arange(100)
    chromatogram = ms.Chromatogram(time, np.ones_like(time))
    chromatogram.baseline = np.zeros_like(time)
    rt, rt_std, n_dev = 50, 10, 1
    peak = ms.lcms.Peak(50, 55, 60, chromatogram)
    expected = peak.get_area()

    def fake_extract_features(self, **kwargs):
        self.features = [peak]

    monkeypatch.setattr(ms.Chromatogram, "extract_features", fake_extract_features)

    area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev)
    assert np.isclose(area, expected)
57 |
58 |
def test_get_fill_area_multiple_valid_peaks_choose_closest(monkeypatch):
    """With several candidate peaks, the one closest to rt is used."""
    time = np.arange(100)
    chromatogram = ms.Chromatogram(time, np.ones_like(time))
    chromatogram.baseline = np.zeros_like(time)
    rt, rt_std, n_dev = 50, 10, 1
    closest = ms.lcms.Peak(45, 50, 52, chromatogram)
    farther = ms.lcms.Peak(55, 60, 65, chromatogram)
    expected = closest.get_area()

    def fake_extract_features(self, **kwargs):
        self.features = [closest, farther]

    monkeypatch.setattr(ms.Chromatogram, "extract_features", fake_extract_features)

    area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev)
    assert np.isclose(area, expected)
78 |
--------------------------------------------------------------------------------
/tests/unit/test_filter.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import tidyms as ms
4 |
5 |
def test_class_remover(data_container_with_order):
    """ClassRemover drops every sample belonging to the removed class."""
    data = data_container_with_order
    n_qc = (data.sample_metadata["class"] == "QC").sum()
    n_total = data.data_matrix.shape[0]
    remover = ms.filter.ClassRemover(["QC"])
    remover.process(data)
    assert data.data_matrix.shape[0] == (n_total - n_qc)
14 |
15 |
def test_class_remover_invalid_class(data_container_with_order):
    """Removing a class that does not exist leaves the data untouched."""
    data = data_container_with_order
    n_total = data.data_matrix.shape[0]
    remover = ms.filter.ClassRemover(["invalid_class"])
    remover.process(data)
    assert data.data_matrix.shape[0] == n_total
23 |
24 |
25 | # def test_prevalence_filter_remove_none(data_container_with_order):
26 | # data = data_container_with_order
27 | # process_classes = None
28 | # lb = 0
29 | # ub = 1
30 | # intraclass = True
31 | # threshold = 0
32 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes, lb=lb,
33 | # ub=ub, intraclass=intraclass,
34 | # threshold=threshold)
35 | # pf.process(data)
36 | # assert True
37 | #
38 | #
39 | # def test_prevalence_filter_remove_one_feature(data_container_with_order):
40 | # data = data_container_with_order
41 | # rm_ft = "FT01"
42 | # data._data_matrix.loc[:, rm_ft] = 0
43 | # process_classes = None
44 | # lb = 0.1
45 | # ub = 1
46 | # intraclass = True
47 | # threshold = 0
48 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes,
49 | # lb=lb,
50 | # ub=ub,
51 | # intraclass=intraclass,
52 | # threshold=threshold)
53 | # pf.process(data)
54 | # assert rm_ft in pf.remove
55 | #
56 | #
57 | # def test_blank_filter_custom_func(data_container_with_order):
58 | # data = data_container_with_order
59 | # bc = ms.filter.BlankCorrector(mode=lambda x: 20)
60 | # bc.process(data)
61 | # assert (data._data_matrix[data.classes
62 | # .isin(bc.params["process_classes"])] == 0).all().all()
63 | #
64 | #
65 | # def test_variation_filter(data_container_with_order):
66 | # data = data_container_with_order
67 | # vf = ms.filter.VariationFilter(lb=0,
68 | # ub=0.2,
69 | # process_classes=None)
70 | # vf.process(data)
71 | # print(vf.remove)
72 | # assert vf.remove.empty
73 |
--------------------------------------------------------------------------------
/tests/unit/test_peaks.py:
--------------------------------------------------------------------------------
1 | import tidyms as ms
2 | import numpy as np
3 | import pytest
4 | from scipy.signal.windows import gaussian
5 | from scipy.special import erfc
6 | from scipy.ndimage import gaussian_filter1d
7 | # from itertools import product
8 |
9 | # random seed
10 | SEED = 1234
11 |
12 |
13 | # noise estimation tests
14 |
@pytest.fixture
def noise():
    """A 500-point Gaussian noise array and its true standard deviation."""
    np.random.seed(SEED)
    true_sigma = 1.0
    return np.random.normal(size=500, scale=true_sigma), true_sigma
20 |
21 |
def test_estimate_local_noise_empty_signal():
    """An empty signal yields a zero noise estimate."""
    estimate = ms.peaks._estimate_local_noise(np.array([]))
    assert np.isclose(estimate, 0.0)
26 |
27 |
@pytest.mark.parametrize("x", [np.array([1]), np.array([1, 2])])
def test_estimate_local_noise_signal_length_lower_than_two(x):
    """Too-short signals yield a zero noise estimate."""
    assert np.isclose(ms.peaks._estimate_local_noise(x), 0.0)
32 |
33 |
def test_estimate_local_noise(noise):
    """The noise estimate is close to the true sigma (20 % tolerance)."""
    signal, sigma = noise
    estimate = ms.peaks._estimate_local_noise(signal)
    assert sigma < 1.2 * estimate
41 |
42 |
def test_estimate_local_noise_non_robust(noise):
    """The non-robust noise estimate is also close to sigma (20 % tolerance)."""
    signal, sigma = noise
    estimate = ms.peaks._estimate_local_noise(signal, robust=False)
    assert sigma < 1.2 * estimate
48 |
49 |
def test_estimate_noise_empty_array():
    """An empty input produces an empty noise estimation array."""
    estimation = ms.peaks.estimate_noise(np.array([]))
    assert estimation.size == 0
54 |
55 |
@pytest.mark.parametrize("x", [np.array([1]), np.array([1, 3]),
                               np.array([1, 4, 6])])
def test_estimate_noise_signal_length_lower_than_two(x):
    """Very short signals produce an all-zero noise estimation."""
    assert np.allclose(ms.peaks.estimate_noise(x), 0.0)
61 |
62 |
def test_estimate_noise_check_size(noise):
    """The noise estimation has the same size as the input signal."""
    signal, _ = noise
    estimation = ms.peaks.estimate_noise(signal, n_slices=2)
    assert signal.size == estimation.size
67 |
68 |
def test_estimate_noise_n_slices(noise):
    """With two slices, each half is constant but the halves differ."""
    signal, _ = noise
    estimation = ms.peaks.estimate_noise(signal, n_slices=2)
    half = signal.size // 2
    # each slice carries a single estimation value
    assert np.allclose(estimation[:half], estimation[0])
    assert np.allclose(estimation[half:], estimation[half])
    # the two slices must have been estimated independently
    assert estimation[0] != estimation[half]
79 |
80 |
def test_estimate_noise_min_slice_size(noise):
    """``min_slice_size`` overrides a smaller ``n_slices``-derived length."""
    signal, _ = noise
    estimation = ms.peaks.estimate_noise(
        signal, n_slices=5, min_slice_size=150)
    # signal has 500 points: 5 slices of 100 < 150, so slices of 150 are used
    # instead; the last slice is extended to 200 to avoid a 50-point slice
    boundaries = [0, 150, 300, 500]
    for start, end in zip(boundaries[:-1], boundaries[1:]):
        assert np.allclose(estimation[start:end], estimation[start])
95 |
96 |
97 | # Test baseline estimation
98 |
def test_find_local_extrema():
    """A triangular signal has extrema at its ends and its apex."""
    ramp = np.arange(10)
    signal = np.hstack((ramp, ramp[::-1]))  # rises then falls: 0, 9, 19
    actual = ms.peaks._find_local_extrema(signal)
    assert np.array_equal(actual, [0, 9, 19])
106 |
107 |
def test_find_local_extrema_no_local_maximum():
    """A monotonic signal has no local extrema."""
    actual = ms.peaks._find_local_extrema(np.arange(10))
    assert np.array_equal(actual, np.array([]))
113 |
114 |
# (extrema indices, expected summed-slice std) pairs used by the test below
test_noise_sum_params = [
    [np.array([0, 1]), np.sqrt([25, 25])],
    [np.array([0]), np.sqrt([34])],
]
117 |
118 |
@pytest.mark.parametrize("index,expected", test_noise_sum_params)
def test_get_noise_sum_slice_std(index, expected):
    """Summed-slice standard deviations match the precomputed values."""
    signal = np.array([3, 4, 2, 2, 1])
    actual = ms.peaks._get_noise_slice_sum_std(signal, np.array(index))
    assert np.allclose(actual, np.array(expected))
126 |
127 |
def test_estimate_noise_probability():
    """Noise probabilities for two slices match the closed-form erfc values."""
    noise = np.ones(7)
    signal = np.array([0, 0.1, 0.4, 2, 1.25, 1.1, 1.0])
    extrema = np.array([0, 3, 6])
    # two slices (sizes 4 and 2); expected values follow from erfc
    expected = erfc([2.5 * np.sqrt(1 / 2) / 2,
                     1.35 * np.sqrt(1 / 2) / 2])
    actual = ms.peaks._estimate_noise_probability(noise, signal, extrema)
    assert np.allclose(expected, actual)
138 |
139 |
def test_build_baseline_index():
    """Points with high noise probability are kept as baseline indices."""
    signal = np.array([0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2, 1, 0])
    extrema = np.array([0, 2, 4, 6, 8, 10, 12])
    probability = np.array([0, 0.25, 0.25, 0.25, 0, 0])
    min_proba = 0.05
    actual = ms.peaks._build_baseline_index(
        signal, probability, min_proba, extrema)
    assert np.array_equal(np.array([0, 4, 5, 6, 12]), actual)
149 |
150 |
def test_estimate_baseline():
    """All points of a low-amplitude signal are classified as baseline.

    A noise array is built with a noise level greater than the noise level in
    the signal, so every point should lie within ``noise`` of the baseline.
    """
    # FIX: seed the generator so the test is deterministic; it was previously
    # unseeded and therefore flaky in principle.
    np.random.seed(SEED)
    n = 100
    x = np.random.normal(size=n, scale=1)
    noise = np.ones(n) * 5
    baseline = ms.peaks.estimate_baseline(x, noise)
    expected_baseline_index = np.arange(n)
    test_baseline_index = np.where(np.abs(x - baseline) < noise)[0]
    assert np.array_equal(expected_baseline_index, test_baseline_index)
162 |
163 |
@pytest.fixture
def single_peak(noise):
    """A Gaussian peak (height 20) with the same length as the noise array."""
    data, _ = noise
    return gaussian(data.size, 2) * 20
169 |
170 |
@pytest.fixture
def two_non_overlapping_peaks(noise):
    """Two well-separated Gaussian peaks and their parameters."""
    data, _ = noise
    grid = np.arange(data.size)
    params = np.array([[100, 2, 50], [150, 2, 25]])
    signal = ms.utils.gaussian_mixture(grid, params).sum(axis=0)
    return signal, params
178 |
179 |
def test_detect_peaks_one_peak(single_peak, noise):
    """Exactly one peak is detected in a noisy single-peak signal."""
    noise_array, _ = noise
    x = single_peak + noise_array
    noise_estimation = ms.peaks.estimate_noise(x)
    x = gaussian_filter1d(x, 1.0)  # smooth to avoid spurious peaks
    baseline_estimation = ms.peaks.estimate_baseline(x, noise_array)
    starts, _, _ = ms.peaks.detect_peaks(
        x, noise_estimation, baseline_estimation)
    assert len(starts) == 1
189 |
190 |
def test_detect_peaks_two_non_overlapping_peaks(two_non_overlapping_peaks,
                                                noise):
    """Both well-separated peaks are detected."""
    noise_array, _ = noise
    signal, _ = two_non_overlapping_peaks
    x = signal + noise_array
    noise_estimation = ms.peaks.estimate_noise(x)
    x = gaussian_filter1d(x, 1.0)  # smooth to avoid spurious peaks
    baseline_estimation = ms.peaks.estimate_baseline(x, noise_array)
    starts, _, _ = ms.peaks.detect_peaks(
        x, noise_estimation, baseline_estimation)
    assert len(starts) == 2
202 |
203 |
@pytest.fixture
def two_overlapping_peaks(noise):
    """Two overlapping Gaussian peaks (apexes 8 points apart) and parameters."""
    data, _ = noise
    grid = np.arange(data.size)
    params = np.array([[100, 2, 50], [108, 2, 25]])
    signal = ms.utils.gaussian_mixture(grid, params).sum(axis=0)
    return signal, params
211 |
212 |
def test_detect_peaks_two_overlapping_peaks(two_overlapping_peaks, noise):
    """Overlapping peaks are split into two peaks sharing one boundary."""
    noise_array, _ = noise
    signal, _ = two_overlapping_peaks
    x = signal + noise_array
    noise_estimation = ms.peaks.estimate_noise(x)
    x = gaussian_filter1d(x, 1.0)  # smooth to avoid spurious peaks
    baseline_estimation = ms.peaks.estimate_baseline(x, noise_array)
    start, apex, end = ms.peaks.detect_peaks(
        x, noise_estimation, baseline_estimation)
    # only two peaks are detected
    assert len(start) == 2
    # the first peak ends right after the second one starts
    assert end[0] == (start[1] + 1)
227 |
--------------------------------------------------------------------------------
/tests/unit/test_validation.py:
--------------------------------------------------------------------------------
1 | from tidyms.validation import *
2 | import pytest
3 |
4 |
@pytest.fixture
def example_validator():
    """Validator exercising every custom rule used across this module."""
    rules = {
        "positive_number": {"is_positive": True},
        "a": {"lower_than": "b"},
        "b": {"lower_or_equal": "c"},
        "c": {"type": "number"},
        "some_function": {"check_with": is_callable},
    }
    return ValidatorWithLowerThan(rules)
15 |
16 |
def test_is_positive_positive_number(example_validator):
    """A strictly positive value passes the is_positive rule."""
    validate({"positive_number": 5}, example_validator)
    assert True
21 |
22 |
def test_is_positive_zero(example_validator):
    """Zero is rejected by the is_positive rule."""
    with pytest.raises(ValueError):
        validate({"positive_number": 0}, example_validator)
27 |
28 |
def test_is_positive_negative_number(example_validator):
    """A negative value is rejected by the is_positive rule."""
    with pytest.raises(ValueError):
        validate({"positive_number": -1}, example_validator)
33 |
34 |
def test_lower_than_valid(example_validator):
    """a < b satisfies the lower_than rule."""
    validate({"a": 5, "b": 6}, example_validator)
    assert True
40 |
41 |
def test_lower_than_invalid(example_validator):
    """a > b violates the lower_than rule."""
    with pytest.raises(ValueError):
        validate({"a": 5, "b": 4}, example_validator)
47 |
48 |
def test_lower_than_invalid_equal(example_validator):
    """a == b violates the strict lower_than rule."""
    with pytest.raises(ValueError):
        validate({"a": 5, "b": 5}, example_validator)
54 |
55 |
def test_lower_or_equal_valid(example_validator):
    """b < c satisfies the lower_or_equal rule."""
    validate({"b": 5, "c": 7}, example_validator)
    assert True
61 |
62 |
def test_lower_or_equal_valid_equal(example_validator):
    """b == c satisfies the lower_or_equal rule."""
    validate({"b": 5, "c": 5}, example_validator)
    assert True
68 |
69 |
def test_lower_or_equal_invalid(example_validator):
    """b > c violates the lower_or_equal rule."""
    with pytest.raises(ValueError):
        validate({"b": 5, "c": 4}, example_validator)
75 |
76 |
def test_is_callable_valid(example_validator):
    """A callable value passes the check_with=is_callable rule."""
    validate({"some_function": sum}, example_validator)
    assert True
82 |
83 |
def test_is_callable_invalid(example_validator):
    """A non-callable value fails the check_with=is_callable rule."""
    with pytest.raises(ValueError):
        validate({"some_function": "invalid_value"}, example_validator)
89 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
# Run the test suite against each supported CPython version.
envlist =
    python3.9,python3.10

[testenv]
# Install test-only requirements, then run pytest inside each environment.
deps= -rtest_requirements.txt
commands=pytest
--------------------------------------------------------------------------------