├── .coveragerc ├── .flake8 ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── conftest.py ├── docs ├── Makefile ├── _templates │ └── autosummary │ │ ├── base.rst │ │ ├── class.rst │ │ └── module.rst ├── api.rst ├── bokeh_plots.py ├── chem.rst ├── conf.py ├── data-curation.rst ├── descriptors.csv ├── examples │ ├── DARTMS_MTBLS1198_SeaOmics__processing.ipynb │ ├── DARTMS_processing.ipynb │ ├── DARTMS_processing_ParameterOptimization.ipynb │ ├── custom_peak_descriptors.py │ ├── defined_spots_supervised.tsv │ └── roi-creation.py ├── feature-correspondence.rst ├── fileio.rst ├── fileio_tutorial.rst ├── glossary.rst ├── index.rst ├── installation.rst ├── mzml.rst ├── peak-picking.rst ├── plots │ ├── dbscan-clustering.py │ ├── dbscan-parameters.py │ ├── gmm-clustering.py │ ├── peak-definition.py │ ├── peak-detection-example.py │ ├── peak_detection_baseline_example.py │ └── roi-definition.py ├── preprocessing-steps.csv ├── processing_datasets.rst ├── quickstart.rst ├── requirements.txt └── tutorials.rst ├── pyproject.toml ├── requirements.txt ├── src └── tidyms │ ├── __init__.py │ ├── _batch_corrector.py │ ├── _build_data_matrix.py │ ├── _constants.py │ ├── _filter_functions.py │ ├── _mzml.py │ ├── _plot_bokeh.py │ ├── annotation │ ├── __init__.py │ ├── annotation.py │ ├── annotation_data.py │ ├── envelope_finder.py │ └── mmi_finder.py │ ├── assay.py │ ├── chem │ ├── __init__.py │ ├── _envelope_utils.py │ ├── _formula_generator.py │ ├── atoms.py │ ├── elements.json │ ├── envelope_tools.py │ ├── formula.py │ ├── isotopes.json │ └── utils.py │ ├── consensus_annotation.py │ ├── container.py │ ├── correspondence.py │ ├── dartms.py │ ├── fileio.py │ ├── fill_missing.py │ ├── filter.py │ ├── lcms.py │ ├── peaks.py │ ├── raw_data_utils.py │ ├── simulation.py │ ├── utils.py │ └── validation.py ├── test_requirements.txt ├── tests ├── __init__.py ├── conftest.py ├── integration │ ├── test_assay_real_data.py │ └── test_real_raw_data.py 
└── unit │ ├── annotation │ ├── test_annotation.py │ ├── test_envelope_finder.py │ └── test_mmi_finder.py │ ├── test_assay.py │ ├── test_batch_corrector.py │ ├── test_build_data_matrix.py │ ├── test_chem │ ├── test_atoms.py │ ├── test_formula.py │ ├── test_formula_generator.py │ ├── test_isotope_distributions.py │ └── test_isotope_scorer.py │ ├── test_consensus_annotation.py │ ├── test_correspondence.py │ ├── test_data_container.py │ ├── test_fileio.py │ ├── test_fill_missing.py │ ├── test_filter.py │ ├── test_lcms.py │ ├── test_peaks.py │ ├── test_raw_data_utils.py │ ├── test_utils.py │ └── test_validation.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tidyms/validation.py 4 | tidyms/_plot_bokeh.py 5 | 6 | [report] 7 | exclude_lines = 8 | def plot 9 | pragma: no cover 10 | def __repr__ -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, E501 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # C generated files by Cython 10 | *.c 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/generated/ 75 | docs/_static/*.html 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # Pycharm 131 | .idea/ 132 | 133 | # VS code 134 | .vscode/ 135 | *.featureML 136 | *.dill 137 | docs/examples/exportedDataMatrix.tsv 138 | docs/examples/defined_spots_rtShifted.tsv 139 | docs/examples/defined_spots.tsv 140 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | build: 7 | os: ubuntu-20.04 8 | tools: 9 | python: "3.9" 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Bioanalytical mass spectrometry group at CIBION-CONICET 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 
15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include tidyms/chem/elements.json 4 | include tidyms/chem/isotopes.json -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # make file for pytest 2 | 3 | .PHONY: test-unit 4 | test-unit: 5 | pytest --cov=tidyms tests/unit 6 | 7 | .PHONY: test-all 8 | test-all: 9 | pytest --cov=tidyms 10 | 11 | .PHONY: coverage 12 | coverage: 13 | pytest --cov=tidyms && coverage html -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TidyMS: Tools for working with MS data in metabolomics 2 | ====================================================== 3 | 4 | TidyMS is a python library for processing Mass Spectrometry data. It aims to 5 | provide easy to use tools to read, process and visualize MS data generated in 6 | metabolomic studies. 7 | 8 | Features 9 | -------- 10 | 11 | TidyMS provides functionality to: 12 | 13 | 1. Read raw MS data in the mzML format 14 | 2. Spectrum and chromatogram creation. 15 | 3. Powerful and flexible peak picking functions optimized for chromatographic 16 | and spectral data. 17 | 4. Feature detection and feature correspondence in LC-MS data. 18 | 5. Reading processed data in a variety of formats (XCMS, MZMine2, ...) 19 | 5. Data matrix curation using widely accepted guidelines from the metabolomics 20 | community. 21 | 6. Interactive visualizations of raw and processed data using Bokeh, or 22 | publication quality plots using seaborn. 
23 | 24 | Installation 25 | ------------ 26 | 27 | The latest release can be installed from PyPI: 28 | 29 | ``` 30 | pip install tidyms 31 | ``` 32 | 33 | Examples 34 | -------- 35 | 36 | Jupyter notebooks with examples are available 37 | [here](https://github.com/griquelme/tidyms-notebooks). 38 | 39 | Tests 40 | ----- 41 | 42 | TidyMS uses unit tests for most of its functionality. 43 | The tests can be executed with 44 | ``` 45 | python setup.py test 46 | ``` 47 | 48 | Documentation 49 | ------------- 50 | 51 | The official documentation is available at 52 | [readthedocs](https://tidyms.readthedocs.io/en/latest/). 53 | 54 | 55 | Citation 56 | -------- 57 | 58 | If you find TidyMS useful, we would appreciate citations: 59 | 60 | Riquelme, G.; Zabalegui, N.; Marchi, P.; Jones, C.M.; Monge, M.E. A Python-Based 61 | Pipeline for Preprocessing LC–MS Data for Untargeted Metabolomics Workflows. 62 | _Metabolites_ **2020**, 10, 416, doi:10.3390/metabo10100416. 63 | 64 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/conftest.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 |
5 | 6 | {{ fullname | escape | underline}} 7 | 8 | .. currentmodule:: {{ module }} 9 | 10 | .. auto{{ objtype }}:: {{ objname }} 11 | 12 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 |
4 |
5 | 6 | {{ fullname | escape | underline}} 7 | 8 | .. currentmodule:: {{ module }} 9 | 10 | 11 | .. autoclass:: {{ name }} 12 | :members: 13 | 14 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. automodule:: {{fullname}} 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | API reference 6 | ============= 7 | 8 | Tools for working with raw data 9 | ------------------------------- 10 | 11 | .. autosummary:: 12 | :toctree: generated 13 | 14 | tidyms.Assay 15 | tidyms.MSData 16 | tidyms.Chromatogram 17 | tidyms.MSSpectrum 18 | 19 | Tools for working with processed data 20 | ------------------------------------- 21 | 22 | .. autosummary:: 23 | :toctree: generated 24 | 25 | tidyms.DataContainer 26 | tidyms.filter.Pipeline 27 | 28 | List of available filters and processors 29 | ---------------------------------------- 30 | 31 | .. autosummary:: 32 | :toctree: generated 33 | 34 | tidyms.filter.BatchCorrector 35 | tidyms.filter.BlankCorrector 36 | tidyms.filter.ClassRemover 37 | tidyms.filter.DilutionFilter 38 | tidyms.filter.DRatioFilter 39 | tidyms.filter.PrevalenceFilter 40 | tidyms.filter.VariationFilter 41 | 42 | Tools for working with chemical data 43 | ------------------------------------ 44 | 45 | .. autosummary:: 46 | :toctree: generated 47 | 48 | tidyms.chem.Formula 49 | tidyms.chem.PeriodicTable 50 | tidyms.chem.FormulaGenerator 51 | tidyms.chem.EnvelopeScorer 52 | 53 | Module reference 54 | ---------------- 55 | 56 | .. 
autosummary:: 57 | :toctree: generated 58 | 59 | tidyms.container 60 | tidyms.correspondence 61 | tidyms.fileio 62 | tidyms.filter 63 | tidyms.lcms 64 | tidyms.peaks 65 | tidyms.raw_data_utils 66 | tidyms.utils 67 | tidyms.chem.atoms 68 | tidyms.chem.envelope_tools 69 | tidyms.chem.formula 70 | 71 | tidyms.dartms -------------------------------------------------------------------------------- /docs/bokeh_plots.py: -------------------------------------------------------------------------------- 1 | from bokeh import plotting 2 | import tidyms as ms 3 | import numpy as np 4 | from pathlib import Path 5 | 6 | 7 | seed = 1234 8 | 9 | 10 | def create_chromatogram() -> ms.Chromatogram: 11 | 12 | filename = "NZ_20200227_039.mzML" 13 | dataset = "test-nist-raw-data" 14 | ms.fileio.download_tidyms_data(dataset, [filename]) 15 | path = Path(ms.fileio.get_tidyms_path()) 16 | path = path.joinpath(dataset, filename) 17 | 18 | ms_data = ms.MSData.create_MSData_instance( 19 | path, 20 | ms_mode="centroid", 21 | instrument="qtof", 22 | separation="uplc" 23 | ) 24 | mz_list = np.array([189.0734]) 25 | return ms.make_chromatograms(ms_data, mz_list)[0] 26 | 27 | 28 | def plot_chromatogram(): 29 | plotting.output_file("_static/chromatogram.html") 30 | chromatogram = create_chromatogram() 31 | p = chromatogram.plot(show=False) 32 | plotting.save(p) 33 | 34 | 35 | def plot_chromatogram_with_peaks(): 36 | # generate always the same plot 37 | plotting.output_file("_static/chromatogram-with-peaks.html") 38 | chromatogram = create_chromatogram() 39 | chromatogram.extract_features() 40 | p = chromatogram.plot(show=False) 41 | plotting.save(p) 42 | 43 | 44 | def feature_plot(): 45 | plotting.output_file("_static/feature-plot.html") 46 | data = ms.fileio.load_dataset("reference-materials") 47 | ignore = ["Z", "SV", "B", "SSS", "SCQC"] 48 | # search [M+H]+ from trp in the features 49 | mz = 205.097 50 | rt = 124 51 | # get a list of features compatible with the given m/z and rt 52 | ft_name = 
data.select_features(mz, rt) 53 | 54 | f = data.plot.feature(ft_name[0], draw=False, ignore_classes=ignore) 55 | plotting.save(f) 56 | 57 | 58 | def pca_plot(): 59 | plotting.output_file("_static/pca-scores.html") 60 | 61 | data = ms.fileio.load_dataset("reference-materials") 62 | ignore = ["Z", "SV", "B", "SSS", "SCQC"] 63 | f = data.plot.pca_scores(fig_params={"height": 250}, 64 | ignore_classes=ignore, 65 | scaling="autoscaling", 66 | draw=False) 67 | plotting.save(f) 68 | 69 | 70 | def create_assay(assay_path) -> ms.Assay: 71 | plotting.output_file("_static/pca-scores.html") 72 | ms.fileio.download_dataset("test-nist-raw-data") 73 | ms.fileio.download_dataset("reference-materials") 74 | tidyms_dir = Path(ms.utils.get_tidyms_path()) 75 | data_path = tidyms_dir.joinpath("test-nist-raw-data") 76 | sample_metadata_path = data_path.joinpath("sample_list.csv") 77 | 78 | assay = ms.Assay( 79 | data_path=data_path, 80 | assay_path=assay_path, 81 | sample_metadata=sample_metadata_path, 82 | separation="uplc", 83 | instrument="qtof" 84 | ) 85 | return assay 86 | 87 | 88 | def plot_roi_assay(assay: ms.Assay, save_path: str): 89 | plotting.output_file(save_path) 90 | sample_name = "NZ_20200227_039" 91 | p = assay.plot.roi(sample_name, show=False) 92 | plotting.save(p) 93 | 94 | 95 | def plot_stacked_chromatogram(assay: ms.Assay): 96 | plotting.output_file("_static/stacked-chromatograms.html") 97 | p = assay.plot.stacked_chromatogram(6, show=False) 98 | plotting.save(p) 99 | 100 | 101 | def create_assay_plots(): 102 | assay_path = "_build/test-assay" 103 | assay = create_assay(assay_path) 104 | mz_list = np.array( 105 | [118.0654, 144.0810, 146.0605, 181.0720, 188.0706, 189.0738, 106 | 195.0875, 205.0969] 107 | ) 108 | make_roi_params = { 109 | "tolerance": 0.015, 110 | "min_intensity": 5000, 111 | "targeted_mz": mz_list, 112 | } 113 | assay.detect_features(verbose=False, **make_roi_params) 114 | plot_roi_assay(assay, "_static/roi-no-peaks.html") 115 | 
assay.extract_features(store_smoothed=True, verbose=False) 116 | assay.describe_features(verbose=False) 117 | assay.build_feature_table() 118 | assay.match_features(verbose=False) 119 | plot_roi_assay(assay, "_static/roi-peaks.html") 120 | plot_stacked_chromatogram(assay) 121 | 122 | 123 | def create_plots(): 124 | plot_chromatogram() 125 | plot_chromatogram_with_peaks() 126 | feature_plot() 127 | pca_plot() 128 | create_assay_plots() -------------------------------------------------------------------------------- /docs/chem.rst: -------------------------------------------------------------------------------- 1 | .. _working-with-chemical-formulas: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Chemical data utilities 6 | ======================= 7 | 8 | The `chem` module contains utilities to work with chemical data such as isotopes, 9 | elements and formulas. Also, it contain utilities to generate formulas from 10 | exact mass, score isotopic envelopes and search isotopic envelope candidates 11 | from a list of m/z values. 12 | 13 | Searching chemical data 14 | ----------------------- 15 | 16 | :func:`~tidyms.chem.PeriodicTable` contains element and isotope information. 17 | The ``get_element`` method returns a :class:`~tidyms.chem.atom.Element` 18 | 19 | .. code-block:: python 20 | 21 | >>> import tidyms as ms 22 | >>> ptable = ms.chem.PeriodicTable() 23 | >>> oxygen = ptable.get_element("O") 24 | >>> oxygen 25 | Element(O) 26 | 27 | Element information can be retrieved easily: 28 | 29 | .. 
code-block:: python 30 | 31 | >>> oxygen.z 32 | 8 33 | >>> oxygen.symbol 34 | "O" 35 | >>> oxygen.isotopes 36 | {16: Isotope(16O), 17: Isotope(17O), 18: Isotope(18O)} 37 | >>> oxygen.get_monoisotope() 38 | Isotope(16O) 39 | >>> oxygen.get_abundances() 40 | (array([16, 17, 18]), 41 | array([15.99491462, 16.9991317 , 17.999161 ]), 42 | array([9.9757e-01, 3.8000e-04, 2.0500e-03])) 43 | 44 | :class:`~tidyms.chem.atom.Isotope` store exact mass, nominal mass and abundance 45 | of each isotope: 46 | 47 | .. code-block:: python 48 | 49 | >>> o16 = oxygen.get_monoisotope() 50 | >>> o16.m 51 | 15.99491462 52 | >>> o16.a 53 | 16 54 | >>> o16.p 55 | 0.99757 56 | 57 | Working with chemical formulas 58 | ------------------------------ 59 | 60 | Chemical formulas can be created with the :class:`~tidyms.chem.Formula` object: 61 | 62 | .. code-block:: python 63 | 64 | >>> water = ms.chem.Formula("H2O") 65 | >>> water 66 | Formula(H2O) 67 | 68 | Formula objects can be used to compute a formula mass and its isotopic envelope: 69 | 70 | .. code-block:: python 71 | 72 | >>> water.get_exact_mass() 73 | 18.010564684 74 | >>> M, p = water.get_isotopic_envelope() 75 | >>> M 76 | array([18.01056468, 19.01555724, 20.01481138, 21.02108788]) 77 | >>> p 78 | array([9.97340572e-01, 6.09327319e-04, 2.04962911e-03, 4.71450803e-07])) 79 | 80 | Formulas can be created by passing a dictionary of element or isotopes to a 81 | formula coefficient and the numerical charge of the formula. Formulas are 82 | implemented as dictionaries of isotopes to formula coefficients, so if an 83 | element is passed, it is assumed that it is the most abundant isotope. 84 | 85 | .. code-block:: python 86 | 87 | >>> f = ms.chem.Formula({"C": 1, "13C": 1, "O": 4}, 0) 88 | >>> f 89 | Formula(C(13C)O4) 90 | 91 | Isotopes can also be specified in the string format: 92 | 93 | .. 
code-block:: python 94 | 95 | >>> f = ms.chem.Formula("[C(13C)2H2O4]2-") 96 | Formula([C(13C)2H2O4]2-) 97 | >>> f.charge 98 | -2 99 | 100 | 101 | Sum formula generation 102 | ---------------------- 103 | 104 | The :class:`~tidyms.chem.FormulaGenerator` generates sum formulas from a mass 105 | value. To generate formulas, the space of formula must be defined by using 106 | and passed to the formula generator constructor: 107 | 108 | .. code-block:: python 109 | 110 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)} 111 | >>> formula_generator = ms.chem.FormulaGenerator(bounds) 112 | 113 | To generate formulas, an exact mass value must be passed, along with a tolerance 114 | to find compatible formulas. 115 | 116 | .. code-block:: python 117 | 118 | >>> f = ms.chem.Formula("C5H10O2") 119 | >>> M = f.get_exact_mass() # Mass value to generate formulas 120 | >>> tolerance = 0.005 121 | >>> formula_generator.generate_formulas(M, tolerance) 122 | >>> coefficients, isotopes, M_coeff = formula_generator.results_to_array() 123 | >>> coefficients 124 | array([[ 0, 10, 2, 4], 125 | [ 3, 8, 3, 1], 126 | [ 5, 10, 0, 2]]) 127 | >>> isotopes 128 | [Isotope(12C), Isotope(1H), Isotope(14N), Isotope(16O)] 129 | 130 | Coefficients is a 2D Numpy array where each row are coefficients of valid 131 | formulas and each column is an isotope. 132 | 133 | Formula generator objects can be created easily by using the static method 134 | :meth:`~tidyms.chem.FormulaGenerator.from_hmdb`, which generates reasonable 135 | coefficients spaces for the CHNOPS elements by finding the maximum coefficients 136 | in compounds from the `Human Metabolome DataBase `_: 137 | 138 | .. code-block:: python 139 | 140 | m = 1000 141 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m) 142 | 143 | ``m`` defines the maximum mass of the compounds included to create the coefficient 144 | space. ``m`` can take values of 500, 1000, 1500 and 2000. 
Other element can be 145 | added as follows = 146 | 147 | .. code-block:: python 148 | 149 | m = 1000 150 | bounds = {"Cl": (0, 2) 151 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m, bounds=bounds) 152 | 153 | 154 | Scoring Isotopic envelopes 155 | -------------------------- 156 | 157 | Scoring measured envelopes against theoretical values is a common strategy 158 | to establish a formula candidate for an unknown compound. The 159 | :class:`~tidyms.chem.EnvelopeScorer` uses the formulas generated by a formula 160 | generator and scores them using a measure of similarity between the measured and 161 | theoretical envelopes: 162 | 163 | .. code-block:: python 164 | 165 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)} 166 | >>> fg = ms.chem.FormulaGenerator(bounds) 167 | >>> envelope_scorer = ms.chem.EnvelopeScorer(fg, scorer="qtof", max_length=10) 168 | 169 | The `max_length` parameter sets the maximum length of the measured envelopes to 170 | compare against theoretical values. The `scorer` parameter can be ``qtof``, 171 | ``orbitrap`` or a callable that implements a custom scorer. In the first two 172 | cases, default parameters are set for values measured in Q-TOF or Orbitrap 173 | instruments. The score method takes a list of exact mass and abundances of an 174 | envelope and scores against all compatible formulas. See the API for a detailed 175 | description on how to customize the scorer function. The results can be obtained 176 | with the :meth:`tidyms.chem.EnvelopeScorer.get_top_results` method: 177 | 178 | .. 
code-block:: python 179 | 180 | >>> import numpy as np 181 | >>> f = ms.chem.Formula("C5H10O2") 182 | >>> M, p = f.get_isotopic_envelope(4) # Get first four peaks from the envelope 183 | >>> tolerance = 0.005 184 | >>> envelope_scorer.score(M, p, tolerance) 185 | >>> coefficients, isotopes, score = envelope_scorer.get_top_results() 186 | >>> coefficients[np.argmax(score)] 187 | array([ 5, 10, 0, 2]) 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath(os.path.pardir)) 16 | sys.path.insert(0, os.path.abspath(os.getcwd())) 17 | from bokeh_plots import create_plots 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'TidyMS' 22 | copyright = "2020, Bioanalytical Mass Spectrometry Group at CIBION-CONICET" 23 | author = 'Gabriel Riquelme' 24 | 25 | # -- generate plot files ----------------------------------------------------- 26 | if not os.path.isdir("_static"): 27 | os.mkdir("_static") 28 | 29 | if not os.path.isdir("_build"): 30 | os.mkdir("_build") 31 | 32 | create_plots() 33 | 34 | # -- General configuration --------------------------------------------------- 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | 'sphinx.ext.autodoc', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.autosummary', 43 | 'sphinx.ext.intersphinx', 44 | 'IPython.sphinxext.ipython_directive', 45 | 'IPython.sphinxext.ipython_console_highlighting', 46 | 'bokeh.sphinxext.bokeh_plot', 47 | 'matplotlib.sphinxext.plot_directive', 48 | 'numpydoc' 49 | ] 50 | 51 | add_module_names = False 52 | # Generate the API documentation when building 53 | autosummary_generate = True 54 | numpydoc_show_class_members = False 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'sphinx_rtd_theme' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 75 | html_static_path = ['_static'] 76 | 77 | intersphinx_mapping = { 78 | 'pandas': ('https://pandas.pydata.org/docs/', None), 79 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 80 | } 81 | 82 | # set index.rst as the master doc 83 | master_doc = 'index' 84 | 85 | # include __init__ in docs 86 | autoclass_content = 'both' -------------------------------------------------------------------------------- /docs/descriptors.csv: -------------------------------------------------------------------------------- 1 | Descriptor,Meaning 2 | height,height relative to the baseline 3 | area,area minus the baseline area 4 | rt,weighted average of the retention time in the peak region 5 | mz,weighted average of the m/z in the peak region 6 | width,"width, computed as the region where the 95 % of the peak area is distributed" 7 | snr,"peak signal-to-noise ratio, defined as the quotient between the peak height and the noise level" 8 | mz std,standard deviation of the m/z in the peak region -------------------------------------------------------------------------------- /docs/examples/custom_peak_descriptors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tidyms.peaks import detect_peaks 3 | from tidyms.peaks import get_peak_descriptors 4 | from tidyms.utils import gaussian_mixture 5 | 6 | # always generate the same plot 7 | np.random.seed(1234) 8 | 9 | # create a signal with two gaussian peaks 10 | x = np.arange(100) 11 | gaussian_params = np.array([[25, 3, 30], [50, 2, 60]]) 12 | y = gaussian_mixture(x, gaussian_params).sum(axis=0) 13 | # add a noise term 14 
| y += np.random.normal(size=y.size, scale=1) 15 | 16 | # detect_peaks also returns the noise and baseline estimation used 17 | peaks, noise, baseline = detect_peaks(y) -------------------------------------------------------------------------------- /docs/examples/defined_spots_supervised.tsv: -------------------------------------------------------------------------------- 1 | msData_ID spotInd include name group class batch startRT_seconds endRT_seconds comment 2 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 0 False Spot_0 unknown unknown 1 10.15000016 308.5039902 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 3 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 1 True Airblank_1 Airblank unknown 1 326.7699909 367.3630142 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 4 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 2 True Airblank_2 Airblank unknown 1 376.4960003 419.1180038 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 5 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 3 True Airblank_3 Airblank unknown 1 428.2509899 469.8579884 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 6 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 4 True Airblank_4 Airblank unknown 1 
478.9920044 521.6139793 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 7 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 5 True Airblank_5 Airblank unknown 1 529.7320175 572.3539925 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 8 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 6 True Airblank_6 Airblank unknown 1 581.4870071 623.0949783 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 9 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 7 True Airblank_7 Airblank unknown 1 632.227993 674.8500252 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 10 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 8 True Airblank_8 Airblank unknown 1 683.9829826 725.5899811 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 11 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 9 True Airblank_9 Airblank unknown 1 734.7229958 776.3310242 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, 
startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 12 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 10 True Airblank_10 Airblank unknown 1 785.4639816 828.0860138 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 13 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 11 True Airblank_11 Airblank unknown 1 837.2190285 878.8260269 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 14 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 12 True Airblank_12 Airblank unknown 1 887.9600143 923.4780121 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 15 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 0 False Spot_0 Inj2ul unknown 2 37.54999995 229.3489981 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 16 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 1 True Inj10ul_1 Inj10ul unknown 2 247.6150131 288.2070065 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 17 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 2 True Inj10ul_2 Inj10ul unknown 2 297.3409939 339.9629974 spot automatically extracted by 
_get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 18 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 3 True Inj10ul_3 Inj10ul unknown 2 348.081007 390.7030106 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 19 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 4 True Inj10ul_4 Inj10ul unknown 2 399.8359966 441.4439964 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 20 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 5 True Inj10ul_5 Inj10ul unknown 2 450.5770111 493.1989861 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 21 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 6 True Inj10ul_6 Inj10ul unknown 2 501.3170242 543.9389992 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 22 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 7 True Inj2ul_1 Inj2ul unknown 2 553.0729866 595.6950188 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 23 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 8 True Inj2ul_2 Inj2ul unknown 2 603.8129997 646.4349747 
spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 24 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 9 True Inj2ul_3 Inj2ul unknown 2 655.5690193 698.1909943 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 25 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 10 True Inj2ul_4 Inj2ul unknown 2 706.3089752 748.9310074 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 26 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 11 True Inj2ul_5 Inj2ul unknown 2 757.0500183 799.6719933 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 27 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 12 True Inj2ul_6 Inj2ul unknown 2 808.8050079 844.3230057 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 28 | -------------------------------------------------------------------------------- /docs/examples/roi-creation.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | import tidyms as ms 3 | import os 4 | 5 | # this code downloads an example file from Metabolights via ftp 6 | study_path = "pub/databases/metabolights/studies/public/MTBLS1919" 7 | sample_path 
= os.path.join(study_path, "Applications/Centroid_data") 8 | filename = "NZ_20200227_041.mzML" 9 | ftp = FTP("ftp.ebi.ac.uk") 10 | ftp.login() 11 | ftp.cwd(sample_path) 12 | with open(filename, "wb") as fin: 13 | ftp.retrbinary("RETR " + filename, fin.write) 14 | ftp.close() 15 | 16 | # specifying instrument and separation used in the experiments provides better 17 | # default values for several functions used in TidyMS 18 | ms_data = ms.MSData.create_MSData_instance( 19 | filename, 20 | ms_mode="centroid", 21 | instrument="qtof", 22 | separation="uplc" 23 | ) 24 | roi_list = ms_data.make_roi() 25 | -------------------------------------------------------------------------------- /docs/fileio.rst: -------------------------------------------------------------------------------- 1 | .. _working-with-raw-data: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | :orphan: 6 | 7 | Working with raw data 8 | ===================== 9 | 10 | TidyMS works with raw data in the mzML format using the :class:`~tidyms.MSData` 11 | class. In this section we show common operations on raw data. For file 12 | conversion to the mzML format see :ref:`this guide ` 13 | 14 | For the examples we will use an example mzML file that can be downloaded with 15 | the following code: 16 | 17 | .. code-block:: python 18 | 19 | import numpy as np 20 | import tidyms as ms 21 | 22 | filename = "NZ_20200227_039.mzML" 23 | dataset = "test-nist-raw-data" 24 | ms.fileio.download_tidyms_data(dataset, [filename], download_dir=".") 25 | 26 | 27 | Raw data 28 | -------- 29 | 30 | Raw MS data in the mzML format can be read through the :class:`~tidyms.MSData` 31 | object. 32 | 33 | ..
code-block:: python 34 | 35 | ms_data = ms.MSData.create_MSData_instance( 36 | filename, 37 | ms_mode="centroid", 38 | instrument="qtof", 39 | separation="uplc" 40 | ) 41 | 42 | It is necessary to specify if the data is in centroid or profile mode using the 43 | :code:`ms_mode` parameter, as some methods work in different ways for each 44 | type of data. Specifying the :code:`instrument` and :code:`separation` is also 45 | recommended, as these parameters set reasonable defaults in different functions 46 | used. 47 | 48 | :class:`~tidyms.MSData` is optimized for low memory usage and only loads the 49 | required data into memory. A single MS spectrum can be loaded using 50 | :meth:`~tidyms.MSData.get_spectrum` which returns a 51 | :class:`~tidyms.lcms.MSSpectrum`. 52 | 53 | .. code-block:: python 54 | 55 | index = 20 56 | sp = ms_data.get_spectrum(index) 57 | 58 | The index used is the order in which the data was stored in the file. In the 59 | same way, a stored chromatogram can be retrieved using 60 | :meth:`~tidyms.MSData.get_chromatogram`. The total count of spectra and 61 | chromatograms in the file can be obtained using 62 | :meth:`tidyms.MSData.get_n_spectra` and 63 | :meth:`tidyms.MSData.get_n_chromatograms` respectively. Iterating over all 64 | the spectra in a file can be done using 65 | :meth:`~tidyms.MSData.get_spectra_iterator`, which generates each one of the 66 | spectra in the file and allows filtering by acquisition time or MS level. 67 | Common operations with raw data are located in :mod:`tidyms.raw_data_utils`. 68 | 69 | 70 | Working with Mass Spectra 71 | ------------------------- 72 | 73 | :class:`~tidyms.MSSpectrum` stores the information from one scan. 
It is mostly 74 | used as a data storage class in several data processing steps, but it also has 75 | functionality to visualize the spectrum using the 76 | :meth:`~tidyms.MSSpectrum.plot` method and to convert a profile data spectrum 77 | into centroid mode using :meth:`tidyms.MSSpectrum.find_centroids`. 78 | 79 | :func:`tidyms.raw_data_utils.accumulate_spectra` combines a series of scans in 80 | a file into a single spectrum: 81 | 82 | .. code-block:: python 83 | 84 | combined_sp = ms.accumulate_spectra(ms_data, start_time=110, end_time=115) 85 | 86 | Chromatograms 87 | ------------- 88 | 89 | Besides the chromatograms stored in a file, extracted chromatograms can be 90 | created :func:`tidyms.raw_data_utils.make_chromatograms` which takes an array of 91 | m/z and returns a list :class:`tidyms.Chromatogram` objects, each one associated 92 | to one of the m/z values provided: 93 | 94 | .. code-block:: python 95 | 96 | mz_list = np.array([189.0734, 205.0967, 188.071]) 97 | chromatograms = ms.make_chromatograms(ms_data, mz_list) 98 | 99 | A chromatogram can be visualized using ``plot`` method: 100 | 101 | .. code-block:: python 102 | 103 | chrom = chromatograms[0] 104 | chrom.plot() 105 | 106 | .. raw:: html 107 | 108 | 109 | 110 | Peaks in a chromatogram are detected using 111 | :meth:`tidyms.lcms.LCRoi.extract_features`, which stores a list of 112 | :class:`tidyms.lcms.Peak` objects in the `features` attribute of the 113 | chromatogram. Plotting again the chromatogram shows the detected peaks: 114 | 115 | .. code-block:: python 116 | 117 | chrom.extract_features() 118 | chrom.plot() 119 | 120 | .. raw:: html 121 | 122 | 123 | 124 | Peak descriptors can be obtained using 125 | :meth:`tidyms.lcms.Roi.describe_features`: 126 | 127 | .. 
code-block:: python 128 | 129 | >>> chrom.describe_features() 130 | [{'height': 16572.38, 'area': 108529.94, 'rt': 125.73, 'width': 14.06, 131 | 'snr': 385.44, 'mz': None, 'mz_std': None}] 132 | 133 | A detailed description of the algorithm used for peak picking can be found 134 | :ref:`here `. These methods are also used to create a data matrix from 135 | a dataset. See :ref:`here ` for a tutorial on how to work with 136 | complete datasets to extract a data matrix. 137 | -------------------------------------------------------------------------------- /docs/fileio_tutorial.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/docs/fileio_tutorial.rst -------------------------------------------------------------------------------- /docs/glossary.rst: -------------------------------------------------------------------------------- 1 | .. definitions 2 | 3 | Definitions 4 | =========== 5 | 6 | Here is a list of the concepts used in TidyMS. 7 | 8 | .. glossary:: 9 | 10 | batch correction 11 | A correction step applied to reduce the time dependent variation in the 12 | metabolite signals due to instrumental response changes, carryover, 13 | or metabolite degradation, among others. 14 | 15 | blank correction 16 | A correction applied on study samples to remove the contribution to 17 | the signal coming from sample preparation. This process consists of 18 | measuring a set of blank samples and using them to estimate the 19 | sample preparation contribution to the signal. 20 | 21 | carryover 22 | A measurement artifact in LC-MS. Occurs when signals from one sample are 23 | detected in the next sample (signals are “carried over”). 24 | 25 | correction 26 | A data curation step where the data matrix is transformed to correct 27 | the data.
28 | 29 | data curation 30 | The process of reducing the bias introduced in the measurements during 31 | sample preparation and data acquisition. Also, the filtration of samples 32 | that cannot be measured in an analytically robust way. 33 | 34 | data matrix 35 | A matrix of feature values where each row is a sample or observation and 36 | each column is a feature. 37 | 38 | feature 39 | A measurable property of a phenomenon being observed. In LC-MS a feature 40 | is usually represented as a chromatographic peak. 41 | 42 | feature correspondence 43 | The process of matching features extracted in different samples. 44 | 45 | feature descriptor 46 | A series of characteristics of a feature. In the case of a 47 | chromatographic peak, feature descriptors can be peak area, retention 48 | time, mean m/z, among others. 49 | 50 | feature detection 51 | The process of finding a feature in a data set. Once a feature is 52 | detected it can be extracted into a feature descriptor. In LC-MS the 53 | feature detection procedure involves the detection of chromatographic 54 | peaks and extraction into rt, m/z and area information. 55 | 56 | feature table 57 | The table obtained after feature extraction, where each row is a 58 | feature detected in a sample and each column is a descriptor. 59 | 60 | filtration 61 | A data curation step where samples or features are removed according 62 | to a specific criterion. 63 | 64 | mapping 65 | A dictionary that maps the sample type to sample classes. The available 66 | sample types are: study sample, quality control, blank, system 67 | suitability. 68 | 69 | normalization 70 | An operation on the data matrix to adjust the sample values. Common 71 | normalization methods use different norms, such as the euclidean 72 | norm, Manhattan norm or maximum norm. 73 | 74 | prevalence filter 75 | A filter applied on a data matrix to remove features that are detected 76 | in a low number of samples.
77 | 78 | quality control sample 79 | Samples applied to demonstrate analytical accuracy, precision, and 80 | repeatability after data processing and can be converted to metrics 81 | describing data quality. 82 | 83 | run order 84 | Temporal order in which the different samples were analyzed. 85 | 86 | sample class 87 | The category of the sample. Can be related to the study (e.g: healthy, 88 | disease) or to the experiment design (quality control, blank, etc...). 89 | 90 | sample descriptor 91 | A characteristic of a sample. Can be the sample type, class, run order, 92 | analytical batch. 93 | 94 | sample type 95 | The type of sample used in the experiment. Sample types can be: study 96 | sample, quality control, blank, system suitability. 97 | 98 | scaling 99 | An operation on the data matrix to change the distribution of features. 100 | 101 | system suitability check 102 | The analysis of a series of samples to assess the performance of an 103 | analytical platform. 104 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. TidyMS documentation master file, created by 2 | sphinx-quickstart on Tue May 19 15:53:07 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | TidyMS 7 | ====== 8 | 9 | TidyMS is a python package that provides easy to use tools for processing and 10 | analyzing mass spectrometry based metabolomics data sets. It's built on top 11 | of Numpy, Pandas and scikit-learn. Get started by reading the 12 | :doc:`installation` instructions and then see an overview of the package in the 13 | :doc:`quickstart`. You can also see some applications in the example gallery. For 14 | detailed information about tidyms, you can see the :doc:`api` reference. 15 | 16 | .. 
toctree:: 17 | :maxdepth: 2 18 | :caption: Contents: 19 | 20 | Glossary 21 | Installation guide 22 | Quickstart 23 | Tutorials 24 | API Reference 25 | 26 | Indices and tables 27 | ================== 28 | 29 | * :ref:`genindex` 30 | * :ref:`modindex` 31 | * :ref:`search` 32 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. installation 2 | 3 | Installation 4 | ============ 5 | 6 | Python version 7 | -------------- 8 | 9 | We recommend to use the last version of Python 3. If you don't have Python 10 | installed we recommend installing it using the `Anaconda`_ distribution . 11 | 12 | .. _Anaconda: https://docs.anaconda.com/anaconda/install/ 13 | 14 | 15 | Install TidyMS 16 | -------------- 17 | 18 | If you already have Python, you can install TidyMS from the Python Package 19 | Index: 20 | 21 | On Linux: 22 | 23 | .. code-block:: sh 24 | 25 | $ pip install tidyms 26 | 27 | On Windows, if you are using Anaconda and didn't add Python to the PATH 28 | environment variable you have to run this command from the conda prompt. 29 | 30 | -------------------------------------------------------------------------------- /docs/mzml.rst: -------------------------------------------------------------------------------- 1 | .. _mzml: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Converting raw data to mzML format 6 | ================================== 7 | 8 | We recommend using `msconvert 9 | `_ to convert raw data 10 | generated from the different instruments to mzML format. Files can be converted 11 | from a GUI or from the command line. To convert all the files with names ending 12 | in :code:`.RAW` inside a directory from the command line the following command 13 | can be used: 14 | 15 | .. 
code-block:: bat 16 | 17 | msconvert *.RAW -o my_output_dir 18 | 19 | If you are using a Waters instrument with lockspray correction, the 20 | :code:`scanEvent` filter can be used to remove the signal from the lockspray. 21 | 22 | .. code-block:: bat 23 | 24 | msconvert *.RAW --filter "scanEvent 1" -o my_output_dir 25 | 26 | To perform feature detection, data must be provided in centroid format. This 27 | can be done using the :code:`peakPicking` filter option: 28 | 29 | .. code-block:: bat 30 | 31 | msconvert data.RAW --filter "peakPicking cwt snr=1 peakSpace=0.01" 32 | 33 | A :code:`snr=1` is recommended as noisy peaks will be removed during feature 34 | detection anyway. :code:`peakSpace` should be chosen according to the 35 | instrument used. For QTOF instruments a value of 0.01 is recommended, but 36 | for higher resolution instruments, such as orbitrap or FT-ICR, lower values 37 | may be used. 38 | -------------------------------------------------------------------------------- /docs/plots/dbscan-clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | n = 200 7 | X1 = np.random.normal(size=(n, 2)) 8 | samples = np.hstack((np.arange(n), np.arange(n))) 9 | X2 = np.random.normal(size=(n, 2), loc=(2, 2)) 10 | X = np.vstack((X1, X2)) 11 | 12 | dbscan_labels = ms.correspondence._cluster_dbscan(X, 2.0, 50, 10000) 13 | gmm_labels, score = ms.correspondence._process_cluster(X, samples, 2, 3.0) 14 | 15 | fig, ax = plt.subplots() 16 | for l in np.unique(dbscan_labels): 17 | ax.scatter(*X[dbscan_labels == l].T, label=l) 18 | 19 | ax.set_xlabel("m/z") 20 | ax.set_ylabel("Rt") 21 | ax.legend(title="DBSCAN labels") -------------------------------------------------------------------------------- /docs/plots/dbscan-parameters.py: -------------------------------------------------------------------------------- 1 | import
numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import DBSCAN 4 | import seaborn as sns 5 | from itertools import product 6 | 7 | sns.set_context("paper", font_scale=1.25) 8 | 9 | 10 | sample_size = [10, 20, 50, 100, 200, 500] 11 | fractions = [0.1, 0.25, 0.5, 0.75, 1.0] 12 | eps = [0.5, 1.0, 2.0, 3.0, 4.0] 13 | n_reps = 5 14 | results = list() 15 | 16 | for k_rep, size, f, e in product(range(n_reps), sample_size, fractions, eps): 17 | X = np.random.normal(size=(size, 2)) 18 | min_samples = round(size * f) 19 | dbscan = DBSCAN(eps=e, min_samples=min_samples, metric="chebyshev") 20 | dbscan.fit(X) 21 | cluster = dbscan.labels_ 22 | noise_fraction = (cluster == -1).sum() / size 23 | results.append([k_rep, size, f, e, noise_fraction]) 24 | df_normal = pd.DataFrame( 25 | data=results, 26 | columns=["rep", "sample size", "sample fraction", "eps", "noise fraction"] 27 | ) 28 | 29 | sns.catplot( 30 | data=df_normal, 31 | x="eps", 32 | y="noise fraction", 33 | palette="Set1", 34 | col="sample size", 35 | hue="sample fraction", 36 | legend="full", 37 | col_wrap=2, 38 | s=8 39 | ) -------------------------------------------------------------------------------- /docs/plots/gmm-clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | n = 200 7 | X1 = np.random.normal(size=(n, 2)) 8 | samples = np.hstack((np.arange(n), np.arange(n))) 9 | X2 = np.random.normal(size=(n, 2), loc=(2, 2)) 10 | X = np.vstack((X1, X2)) 11 | 12 | dbscan_labels = ms.correspondence._cluster_dbscan(X, 2.0, 50, 10000) 13 | gmm_labels, score = ms.correspondence._process_cluster(X, samples, 2, 3.0) 14 | 15 | fig, ax = plt.subplots() 16 | for l in np.unique(gmm_labels): 17 | ax.scatter(*X[gmm_labels == l].T, label=l) 18 | 19 | ax.set_xlabel("m/z") 20 | ax.set_ylabel("Rt") 21 | ax.legend(title="GMM labels") 
-------------------------------------------------------------------------------- /docs/plots/peak-definition.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # always generate the same plot 6 | np.random.seed(1234) 7 | 8 | grid = np.arange(50) 9 | signal = ms.utils.gauss(grid, 25, 2, 30) 10 | noise = np.random.normal(size=signal.size, scale=1) 11 | x = signal + noise + 3 12 | peak = ms.lcms.Peak(19, 25, 30) 13 | fig, ax = plt.subplots(figsize=(6, 6)) 14 | ax.plot(grid, x, label="signal") 15 | ax.scatter(grid[peak.start], x[peak.start], label="peak start", s=50) 16 | ax.scatter(grid[peak.apex], x[peak.apex], label="peak apex", s=50) 17 | ax.scatter(grid[peak.end], x[peak.end], label="peak end", s=50) 18 | ax.fill_between(grid[peak.start:peak.end + 1], 19 | x[peak.start:peak.end + 1], alpha=0.2, label="peak region") 20 | ax.annotate(text='', xy=(grid[peak.end + 5], x[peak.end]), 21 | xytext=(grid[peak.end + 5], x[peak.apex]), 22 | arrowprops=dict(arrowstyle='<->')) 23 | ax.annotate(text='peak \n prominence', 24 | xy=(grid[peak.end + 10],x[peak.apex] / 2)) 25 | ax.legend() 26 | -------------------------------------------------------------------------------- /docs/plots/peak-detection-example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tidyms import peaks 4 | from tidyms.lcms import Peak 5 | from tidyms.utils import gaussian_mixture 6 | 7 | # always generate the same plot 8 | np.random.seed(1234) 9 | 10 | # create a signal with two gaussian peaks 11 | x = np.arange(100) 12 | gaussian_params = np.array([[25, 3, 30], [50, 2, 60]]) 13 | y = gaussian_mixture(x, gaussian_params).sum(axis=0) 14 | # add a noise term 15 | y += np.random.normal(size=y.size, scale=0.5) 16 | 17 | noise_estimation = peaks.estimate_noise(y) 18 | baseline_estimation = 
peaks.estimate_baseline(y, noise_estimation) 19 | start, apex, end = peaks.detect_peaks(y, noise_estimation, baseline_estimation) 20 | peaks = [Peak(s, a, p) for s, a, p in zip(start, apex, end)] 21 | fig, ax = plt.subplots() 22 | ax.plot(x, y) 23 | for p in peaks: 24 | ax.fill_between(x[p.start:p.end], y[p.start:p.end], alpha=0.25) 25 | -------------------------------------------------------------------------------- /docs/plots/peak_detection_baseline_example.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | signal_height = 100 7 | snr = 10 8 | n_col = 4 9 | x = np.arange(200) 10 | noise_level = signal_height / snr 11 | noise = np.random.normal(size=x.size, scale=noise_level) 12 | fig, ax = plt.subplots( 13 | nrows=3, ncols=n_col, figsize=(12, 12), sharex=True, sharey=True) 14 | 15 | # first row: one peak, different baselines 16 | row = 0 17 | baselines = [4, ms.utils.gauss(x, 100, 40, 20), x ** 2 * 0.002, 18 | np.sin(x * np.pi / 400) * 50] 19 | for col in range(n_col): 20 | signal = ms.utils.gauss(x, 100, 3, signal_height) 21 | y = signal + noise 22 | noise_estimation = ms.peaks.estimate_noise(y) 23 | ys = ms.lcms.gaussian_filter1d(y, 1) 24 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 25 | start, apex, end = ms.peaks.detect_peaks( 26 | ys, noise_estimation, baseline_estimation) 27 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 28 | ax[row, col].plot(x, y) 29 | ax[row, col].plot(x, baseline_estimation) 30 | for p in peaks: 31 | ax[row, col].fill_between(x[p.start:p.end + 1], 32 | baseline_estimation[p.start:p.end + 1], 33 | y[p.start:p.end + 1], alpha=0.25) 34 | 35 | # second row: two peaks, same baselines as first row 36 | row = 1 37 | for col in range(n_col): 38 | gaussian_params = np.array([[100, 3, signal_height], 39 | [110, 3, signal_height]]) 40 | signal = 
ms.utils.gaussian_mixture(x, gaussian_params).sum(axis=0) 41 | y = signal + baselines[col] + noise 42 | noise_estimation = ms.peaks.estimate_noise(y) 43 | ys = ms.lcms.gaussian_filter1d(y, 1) 44 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 45 | start, apex, end = ms.peaks.detect_peaks( 46 | ys, noise_estimation, baseline_estimation) 47 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 48 | ax[row, col].plot(x, y) 49 | ax[row, col].plot(x, baseline_estimation) 50 | for p in peaks: 51 | ax[row, col].fill_between(x[p.start:p.end + 1], 52 | baseline_estimation[p.start:p.end + 1], 53 | y[p.start:p.end + 1], alpha=0.25) 54 | 55 | # third row: different peak widths: 56 | row = 2 57 | widths = [3, 5, 7, 10] 58 | for col in range(n_col): 59 | w = widths[col] 60 | signal = ms.utils.gauss(x, 100, w, signal_height) 61 | y = signal + baselines[0] + noise 62 | noise_estimation = ms.peaks.estimate_noise(y) 63 | ys = ms.lcms.gaussian_filter1d(y, 1) 64 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 65 | start, apex, end = ms.peaks.detect_peaks( 66 | ys, noise_estimation, baseline_estimation) 67 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 68 | ax[row, col].plot(x, y) 69 | ax[row, col].plot(x, baseline_estimation) 70 | for p in peaks: 71 | ax[row, col].fill_between(x[p.start:p.end + 1], 72 | baseline_estimation[p.start:p.end + 1], 73 | y[p.start:p.end + 1], alpha=0.25) 74 | -------------------------------------------------------------------------------- /docs/plots/roi-definition.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # always generate the same plot 6 | np.random.seed(1234) 7 | grid = np.arange(50) 8 | signal = ms.utils.gauss(grid, 25, 2, 30) 9 | noise = np.random.normal(size=signal.size, scale=1) 10 | x = signal + noise + 3 11 | mz_mean = 
203.08215 12 | mz = np.random.normal(size=signal.size, scale=0.0005) + mz_mean 13 | 14 | fig, ax = plt.subplots(figsize=(6, 6), nrows=2, sharex=True) 15 | ax[1].plot(grid, x) 16 | ax[1].set_ylabel("Intensity") 17 | ax[1].set_xlabel("Retention Time") 18 | ax[0].plot(grid, mz) 19 | ax[0].set_ylabel("m/z") 20 | ax[0].set_ylim(mz_mean - 0.0025, mz_mean + 0.0025) 21 | -------------------------------------------------------------------------------- /docs/preprocessing-steps.csv: -------------------------------------------------------------------------------- 1 | #,name,description 2 | 1,Feature Detection,"Regions of interest (ROI) are detected in each sample." 3 | 2,Feature Extraction,"Features are extracted from each ROI." 4 | 3,Feature description,"A table of feature descriptors is built for each sample." 5 | 4,Feature table construction,"A feature table for all samples is built" 6 | 5,Feature matching,"Features found in different samples are grouped if they have a common identity." 7 | 6,Data matrix creation,"The data matrix is created using the feature table." -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Quickstart 6 | ========== 7 | 8 | TidyMS [1]_ is a Python package that provides tools to process and analyze 9 | Mass Spectrometry (MS) data. Although suited for general use, it was designed 10 | to be used with datasets from LC-HRMS metabolomics experiments. It uses 11 | `Numpy `_, `Pandas `_ and 12 | `scikit-learn `_ for data processing and analysis. 13 | Some of the functionality that offers is: 14 | 15 | - read raw data in the mzML format using :class:`tidyms.MSData` class, optimized for speed and low memory usage. 16 | - Creation of chromatograms and accumulated spectra from raw data. 
17 | - :term:`Feature detection` and :term:`feature correspondence` in metabolomics datasets using the :class:`tidyms.Assay` class. 18 | - Read processed data from other mass spectrometry processing software (XCMS, mzmine2, etc...). 19 | - A container object to manage metabolomics data. 20 | - :term:`Data curation` of untargeted metabolomics data sets using widely accepted practices from the metabolomics community [2]_ [3]_ 21 | - Interactive data visualization using `bokeh `_, or publication quality plots using `seaborn `_. 22 | 23 | In the rest of this guide, you can find links for different use cases for the 24 | TidyMS package. A basic knowledge of MS and metabolomics is assumed, but you can 25 | look up in the :doc:`glossary` the concepts used in the guides. 26 | Installation instructions are available :doc:`here`. 27 | 28 | You can refer to the following guides to learn about specific topics: 29 | 30 | - :ref:`Working with raw data ` 31 | - :ref:`Processing complete datasets from raw data ` 32 | - :ref:`Curation of a metabolomics data matrix ` 33 | - :ref:`Feature detection and extraction algorithms ` 34 | - :ref:`Feature correspondence algorithm ` 35 | - :ref:`Converting proprietary instrument-specific formats into mzML ` 36 | 37 | 38 | References 39 | ---------- 40 | 41 | .. [1] Riquelme, G. *et al*, "A Python-Based Pipeline for Preprocessing LC–MS 42 | Data for Untargeted Metabolomics Workflows". Metabolites 2020, 10, 416. 43 | https://doi.org/10.3390/metabo10100416 44 | .. [2] W B Dunn *et al*, "Procedures for large-scale metabolic profiling of 45 | serum and plasma using gas chromatography and liquid chromatography 46 | coupled to mass spectrometry", Nature Protocols volume 6, pages 47 | 1060–1083 (2011). 48 | ..
[3] D Broadhurst *et al*, "Guidelines and considerations for the use of 50 | system suitability and quality control samples in mass spectrometry assays 51 | applied in untargeted clinical metabolomic studies.", Metabolomics, 52 | 2018;14(6):72. doi: 10.1007/s11306-018-1367-3 -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh 2 | numpydoc 3 | sphinx 4 | tidyms 5 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Tutorials 6 | ========= 7 | 8 | In this section there is available a list of tutorials on specific topics. 9 | 10 | * :ref:`Converting files to mzML ` 11 | * :ref:`Working with raw data ` 12 | * :ref:`Processing complete datasets ` 13 | * :ref:`Feature detection ` 14 | * :ref:`Feature correspondence ` 15 | * :ref:`Working with chemical formulas ` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "tidyms" 7 | version = "0.7.0" 8 | description = "Tools for working with MS data in metabolomics" 9 | authors = [ 10 | { name = "Gabriel Riquelme" }, 11 | ] 12 | readme = "README.md" 13 | license = {file = "LICENSE"} 14 | classifiers = [ 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "License :: OSI Approved :: BSD License", 18 | "Topic :: Scientific/Engineering :: Bio-Informatics", 19 | "Topic :: Scientific/Engineering :: Chemistry", 20 | "Topic :: Scientific/Engineering :: Medical Science Apps." 
21 | ] 22 | dependencies = [ 23 | "beautifulsoup4>=4.11.2", 24 | "bokeh>=3.0", 25 | "Cerberus>=1.3", 26 | "dill>=0.3.6", 27 | "ipython>=8.1", 28 | "joblib>=1.1", 29 | "matplotlib>=3.5.1", 30 | "natsort>=8.2.0", 31 | "networkx>=3.0", 32 | "numpy>=1.22", 33 | "openpyxl>=3.0", 34 | "pandas>=1.5.3", 35 | "plotnine>=0.10.1", 36 | "requests", 37 | "scikit-learn>=1.0.2", 38 | "scipy>=1.8", 39 | "seaborn>=0.11", 40 | "statsmodels>=0.13", 41 | "tqdm>=4.0", 42 | "umap-learn>=0.5.3", 43 | "xlrd>=2.0" 44 | ] 45 | requires-python = ">=3.9" 46 | 47 | [project.urls] 48 | Homepage = "https://github.com/griquelme/tidyms" 49 | 50 | [tool.mypy] 51 | python_version = "3.9" 52 | files = ["src/tidyms"] 53 | 54 | [tool.pytest.ini_options] 55 | pythonpath = [ 56 | ".", "./src" 57 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh>=3.0.3 2 | Cerberus>=1.3 3 | ipython>=8.1 4 | joblib>=1.1 5 | matplotlib>=3.5.1 6 | networkx>=3.0 7 | numpy>=1.22 8 | openpyxl>=3.0 9 | pandas>=1.5.3 10 | requests 11 | scikit-learn>=1.0.2 12 | scipy>=1.8 13 | seaborn>=0.12 14 | statsmodels>=0.13 15 | tqdm>=4.0 16 | xlrd>=2.0 17 | 18 | plotnine>=0.10.1 19 | natsort>=8.2.0 20 | beautifulsoup4>=4.11.2 21 | dill>=0.3.6 22 | umap-learn>=0.5.3 -------------------------------------------------------------------------------- /src/tidyms/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TidyMS 3 | ====== 4 | 5 | A package to work with Mass Spectrometry data from Metabolomics Experiments. 6 | 7 | Provides 8 | 1. The Assay object to process datasets from raw data. 9 | 2. The MSData object to work with raw data. 10 | 3. The DataContainer object to store metabolomics data sets. 11 | 4. Pipeline and Processor objects to perform curation of data sets. 12 | 13 | """ 14 | 15 | from . import chem 16 | from . import fileio 17 | from . 
import container 18 | from . import utils 19 | from . import peaks 20 | from . import filter 21 | from . import lcms 22 | from . import simulation 23 | from . import raw_data_utils 24 | from . import _mzml 25 | from . import _build_data_matrix 26 | from . import correspondence 27 | from . import fill_missing 28 | from . import consensus_annotation 29 | from .container import DataContainer 30 | from .fileio import MSData 31 | from .lcms import Chromatogram, MSSpectrum 32 | from .assay import Assay 33 | from .raw_data_utils import * 34 | from . import dartms 35 | from .annotation import annotation 36 | 37 | utils.create_tidyms_dir() 38 | SETTINGS = utils.get_settings() 39 | 40 | if SETTINGS["bokeh"]["apply_theme"]: 41 | from bokeh.themes import Theme as _Theme 42 | from bokeh.io import curdoc as _curdoc 43 | theme = SETTINGS["bokeh"]["theme"] 44 | _curdoc().theme = _Theme(json=theme) 45 | 46 | if utils.is_notebook(): 47 | from bokeh.plotting import output_notebook as _output_notebook 48 | 49 | _output_notebook() 50 | -------------------------------------------------------------------------------- /src/tidyms/_constants.py: -------------------------------------------------------------------------------- 1 | from typing import Final, List 2 | 3 | 4 | # separation modes 5 | HPLC: Final[str] = "hplc" 6 | UPLC: Final[str] = "uplc" 7 | DART: Final[str] = "None/DART" 8 | LC_MODES: Final[List[str]] = [UPLC, HPLC, DART] 9 | SEPARATION_MODES: Final[List[str]] = LC_MODES + [] 10 | 11 | # instruments 12 | QTOF: Final[str] = "qtof" 13 | ORBITRAP: Final[str] = "orbitrap" 14 | MS_INSTRUMENTS: Final[List[str]] = [QTOF, ORBITRAP] 15 | 16 | # MS mode 17 | CENTROID: Final[str] = "centroid" 18 | PROFILE: Final[str] = "profile" 19 | MS_MODES: Final[List[str]] = [CENTROID, PROFILE] 20 | 21 | # Data loading 22 | MEMORY: Final[str] = "memory" 23 | INFILE: Final[str] = "file" 24 | SIMULATED: Final[str] = "simulated" 25 | DATA_LOAD_MODES: Final[List[str]] = [MEMORY, INFILE, SIMULATED] 26 | 
DEFAULT_DATA_LOAD_MODE = INFILE 27 | 28 | # feature descriptors 29 | FEATURE: Final[str] = "feature" 30 | MZ: Final[str] = "mz" 31 | RT_START: Final[str] = "rt start" 32 | RT_END: Final[str] = "rt end" 33 | RT: Final[str] = "rt" 34 | RT_STD: Final[str] = "rt std" 35 | AREA: Final[str] = "area" 36 | WIDTH: Final[str] = "width" 37 | HEIGHT: Final[str] = "height" 38 | SNR: Final[str] = "snr" 39 | MZ_STD: Final[str] = "mz_std" 40 | ROI_INDEX: Final[str] = "roi_index" 41 | FT_INDEX: Final[str] = "ft_index" 42 | MERGED: Final[str] = "merged" 43 | 44 | # chromatogram names 45 | BASELINE: Final[str] = "baseline" 46 | NOISE: Final[str] = "noise" 47 | SPINT: Final[str] = "spint" # spectral intensity 48 | ROI_FEATURE_LIST: Final[str] = "features" 49 | TIME: Final[str] = "time" 50 | SCAN: Final[str] = "scan" 51 | MODE: Final[str] = "mode" 52 | 53 | # peak names 54 | START: Final[str] = "start" 55 | APEX: Final[str] = "apex" 56 | END: Final[str] = "end" 57 | 58 | # isotopologue envelope annotation 59 | ENVELOPE_LABEL: Final[str] = "envelope_label" 60 | ENVELOPE_INDEX: Final[str] = "envelope_index" 61 | CHARGE: Final[str] = "charge" 62 | 63 | # sample metadata 64 | SAMPLE: Final[str] = "sample" 65 | CLASS: Final[str] = "class" 66 | ORDER: Final[str] = "order" 67 | BATCH: Final[str] = "batch" 68 | LABEL: Final[str] = "cluster" 69 | ID: Final[str] = "id" 70 | DILUTION: Final[str] = "dilution" 71 | TYPE: Final[str] = "type" 72 | 73 | # sample types 74 | QC_TYPE: Final[str] = "qc" 75 | DQC_TYPE: Final[str] = "dqc" 76 | STUDY_TYPE: Final[str] = "sample" 77 | BLANK_TYPE: Final[str] = "blank" 78 | SAMPLE_TYPES: Final[list[str]] = [QC_TYPE, STUDY_TYPE, BLANK_TYPE, DQC_TYPE] 79 | 80 | 81 | # assay file and dir names 82 | ROI_DIR: Final[str] = "roi" 83 | FT_DIR: Final[str] = "feature" 84 | MANAGER_FILENAME: Final[str] = "metadata.pickle" 85 | FT_TABLE_FILENAME: Final[str] = "feature-table.pickle" 86 | DATA_MATRIX_FILENAME: Final[str] = "data-matrix.pickle" 87 | 88 | # preprocessing steps 
89 | DETECT_FEATURES: Final[str] = "detect_features" 90 | EXTRACT_FEATURES: Final[str] = "extract_features" 91 | DESCRIBE_FEATURES: Final[str] = "describe_features" 92 | ANNOTATE_ISOTOPOLOGUES: Final[str] = "annotate_isotopologues" 93 | ANNOTATE_ADDUCTS: Final[str] = "annotate_adducts" 94 | BUILD_FEATURE_TABLE: Final[str] = "build_feature_table" 95 | MATCH_FEATURES: Final[str] = "match_features" 96 | MAKE_DATA_MATRIX: Final[str] = "make_data_matrix" 97 | FILL_MISSING: Final[str] = "fill_missing" 98 | 99 | PREPROCESSING_STEPS: Final[List[str]] = [ 100 | DETECT_FEATURES, 101 | EXTRACT_FEATURES, 102 | DESCRIBE_FEATURES, 103 | ANNOTATE_ISOTOPOLOGUES, 104 | ANNOTATE_ADDUCTS, 105 | BUILD_FEATURE_TABLE, 106 | MATCH_FEATURES, 107 | MAKE_DATA_MATRIX, 108 | FILL_MISSING, 109 | ] 110 | -------------------------------------------------------------------------------- /src/tidyms/_plot_bokeh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import bokeh.plotting 3 | from bokeh.palettes import all_palettes 4 | from bokeh.models import ColumnDataSource, Segment 5 | from .utils import get_settings 6 | from . 
def palette_cycler(palette: List[str]) -> Generator[str, None, None]:
    """Yield the colors of `palette` in order, restarting from the first
    color after the last one is reached."""
    position = 0
    n_colors = len(palette)
    while True:
        yield palette[position]
        position = (position + 1) % n_colors
def fill_area(
    figure: bokeh.plotting.figure,
    x: np.ndarray,
    y: np.ndarray,
    start: int,
    end: int,
    color: str,
    **varea_params,
):
    """
    Shades the region between ``y[start:end]`` and zero on `figure`.

    Parameters
    ----------
    figure : bokeh.plotting.figure
    x : array
    y : array
    start : int
        first index of the shaded region.
    end : int
        index after the last point of the shaded region.
    color : str
        fill color for the area.
    varea_params :
        key-value parameters passed to ``figure.varea``; they override the
        defaults taken from the bokeh settings.

    """
    params = get_varea_params()
    # updating with an empty dict is a no-op, so no guard is needed
    params.update(varea_params)
    figure.varea(x[start:end], y[start:end], 0, fill_color=color, **params)
cover 146 | """ 147 | Methods to plot data from an Assay. Generates Bokeh Figures. 148 | 149 | Methods 150 | ------- 151 | roi(sample: str) : 152 | m/z vs Rt view of the ROI and features in a sample. 153 | stacked_chromatogram(feature: int) : 154 | Overlapped chromatographic peaks for a feature in all samples 155 | 156 | """ 157 | def __init__(self, assay): 158 | self.assay = assay 159 | self.roi_index = None 160 | self.ft_index = None 161 | 162 | def _build_roi_index_table(self): 163 | ft_table = self.assay.feature_table.copy() 164 | ft_table = ft_table[ft_table[c.LABEL] > -1] 165 | self.roi_index = ( 166 | ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.ROI_INDEX) 167 | .fillna(-1) 168 | .astype(int) 169 | ) 170 | 171 | def _build_peak_index_table(self): 172 | ft_table = self.assay.feature_table.copy() 173 | ft_table = ft_table[ft_table[c.LABEL] > -1] 174 | self.ft_index = ( 175 | ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.FT_INDEX) 176 | .fillna(-1) 177 | .astype(int) 178 | ) 179 | 180 | def roi(self, sample: str, show: bool = True) -> bokeh.plotting.figure: 181 | """ 182 | Plots m/z vs time dispersion of the ROI in a sample. Detected features 183 | are highlighted using circles. 184 | 185 | Parameters 186 | ---------- 187 | sample : str 188 | sample used in the Assay. 189 | show : bool, default=True 190 | If True calls ``bokeh.plotting.show`` on the Figure. 
    def stacked_chromatogram(
        self,
        cluster: int,
        include_classes: Optional[List[str]] = None,
        show: bool = True
    ) -> bokeh.plotting.figure:
        """
        Plots chromatograms of a feature detected across different samples.

        Parameters
        ----------
        cluster : int
            cluster value obtained from feature correspondence.
        include_classes : List[str] or None, default=None
            List of classes to plot. If None is used, samples from all classes
            are plotted.
        show : bool, default=True
            If True calls ``bokeh.plotting.show`` on the Figure.

        Returns
        -------
        bokeh Figure

        Raises
        ------
        ValueError
            If the assay has not gone through the feature matching step.

        """
        if not self.assay.manager.check_step("match_features"):
            msg = "This plot only can be generated after feature matching"
            raise ValueError(msg)
        else:
            # lazily build the sample -> feature/ROI index lookup tables
            if self.ft_index is None:
                self._build_peak_index_table()

            if self.roi_index is None:
                self._build_roi_index_table()

            fig_params = get_chromatogram_figure_params()
            fig = bokeh.plotting.figure(**fig_params)
            # per-sample ROI/feature indices for this cluster; -1 marks
            # samples where the feature was not detected
            roi_index = self.roi_index[cluster].to_numpy()
            ft_index = self.ft_index[cluster].to_numpy()
            samples = self.roi_index.index
            # TODO: fix after refactoring DataContainers
            classes = self.assay.get_sample_metadata()["class"]
            palette = get_palette()
            if include_classes is not None:
                class_to_color = dict()
                for k, cl in enumerate(include_classes):
                    class_to_color[cl] = palette[k]

            # NOTE(review): the loop variables below shadow the `roi_index`
            # and `ft_index` arrays; harmless because the arrays are not used
            # after this point, but worth renaming eventually.
            iterator = zip(samples, roi_index, ft_index, classes)
            for sample, roi_index, ft_index, class_ in iterator:
                # draw only detected features, restricted to
                # `include_classes` when it is provided
                check_draw = (
                    (roi_index > -1) and
                    ((include_classes is None) or (class_ in include_classes))
                )
                if check_draw:
                    if include_classes is None:
                        color = palette[0]
                    else:
                        color = class_to_color[class_]
                    r = self.assay.load_roi(sample, roi_index)
                    ft = r.features[ft_index]
                    add_line(fig, r.time, r.spint)
                    fill_area(
                        fig, r.time, r.spint, ft.start, ft.end, color, alpha=0.2)
            set_chromatogram_axis_params(fig)
            if show:
                bokeh.plotting.show(fig)
        return fig
def create_annotation_table(feature_list: list[Feature]) -> pd.DataFrame:
    """
    Build a DataFrame summarizing the isotopologue annotation of each feature.

    Parameters
    ----------
    feature_list : list[Feature]
        Annotated features.

    Returns
    -------
    pd.DataFrame
        One row per feature with its ROI index, feature index, envelope
        index, envelope label and charge.

    """
    rows: dict[str, list[int]] = {
        c.ROI_INDEX: [],
        c.FT_INDEX: [],
        c.ENVELOPE_INDEX: [],
        c.ENVELOPE_LABEL: [],
        c.CHARGE: [],
    }

    for feature in feature_list:
        ann = feature.annotation
        rows[c.ROI_INDEX].append(feature.roi.index)
        rows[c.FT_INDEX].append(feature.index)
        rows[c.ENVELOPE_INDEX].append(ann.isotopologue_index)
        rows[c.ENVELOPE_LABEL].append(ann.isotopologue_label)
        rows[c.CHARGE].append(ann.charge)

    return pd.DataFrame(rows)
def annotate(
    feature_list: list[Feature],
    mmi_finder: MMIFinder,
    envelope_finder: EnvelopeFinder,
    envelope_validator: EnvelopeValidator,
) -> None:
    """
    Annotate isotopologues in a sample.

    Annotations are added to the `annotation` attribute of each feature.

    Parameters
    ----------
    feature_list : list[Feature]
        List of features obtained after feature extraction.
    mmi_finder : MMIFinder
    envelope_finder : EnvelopeFinder
    envelope_validator : EnvelopeValidator

    """
    data = AnnotationData(feature_list)
    # greedy strategy: take the most intense non-annotated feature, find the
    # best isotopic envelope that contains it, flag the envelope members as
    # annotated, and repeat until no non-annotated feature remains.
    monoisotopologue = data.get_monoisotopologue()
    polarity = mmi_finder.polarity
    while monoisotopologue is not None:
        mmi_candidates = mmi_finder.find(data)
        envelope, charge = find_best_envelope(
            data,
            monoisotopologue,
            polarity,
            mmi_candidates,
            envelope_finder,
            envelope_validator,
        )
        # annotate() always flags at least `monoisotopologue`, so the loop
        # makes progress on every iteration and terminates.
        data.annotate(envelope, charge)
        monoisotopologue = data.get_monoisotopologue()
min_length: 178 | return 0 179 | 180 | if monoisotopologue not in candidate: 181 | return 0 182 | 183 | M, p = candidate[0].compute_isotopic_envelope(candidate) 184 | em_correction = EM * charge * polarity 185 | M = np.array(M) * charge - em_correction 186 | p = np.array(p) 187 | return validator.validate(M, p) 188 | -------------------------------------------------------------------------------- /src/tidyms/annotation/annotation_data.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..lcms import Feature 3 | from collections.abc import Sequence 4 | 5 | 6 | class AnnotationData: 7 | """ 8 | Feature data. 9 | 10 | Attributes 11 | ---------- 12 | features : list[Feature] 13 | List of features sorted by m/z 14 | annotation : dict[Feature, int] 15 | Annotation of features 16 | similarity_cache : SimilarityCache 17 | Stores similarity between features. 18 | non_annotated : set[Feature] 19 | Non-annotated features. 20 | 21 | """ 22 | 23 | def __init__(self, features: Sequence[Feature]): 24 | self.features = sorted(features) 25 | self.non_annotated = set(features) 26 | self._monoisotopologues = sorted(features, key=lambda x: x.height) 27 | self.similarity_cache = SimilarityCache() 28 | self._label_counter = 0 29 | 30 | def get_monoisotopologue(self) -> Optional[Feature]: 31 | """Gets the current non-annotated feature with the greatest area.""" 32 | if self._monoisotopologues: 33 | mono = self._monoisotopologues[-1] 34 | while mono not in self.non_annotated: 35 | self._monoisotopologues.pop() 36 | if self._monoisotopologues: 37 | mono = self._monoisotopologues[-1] 38 | else: 39 | mono = None 40 | else: 41 | mono = None 42 | return mono 43 | 44 | def annotate(self, features: Sequence[Feature], charge: int): 45 | """Labels a list of features as an isotopic envelope.""" 46 | if len(features) > 1: 47 | for k, ft in enumerate(features): 48 | ft.annotation.charge = charge 49 | 
class SimilarityCache:
    """Stores and retrieves the similarity between features in a sample."""

    def __init__(self):
        # maps each feature to a dict of {other feature: similarity}
        self._cache: dict[Feature, dict[Feature, float]] = dict()

    def get_similarity(self, ft1: Feature, ft2: Feature):
        """Return the similarity between two features, computing and caching
        it on first request. The value is stored symmetrically."""
        cache1 = self._cache.setdefault(ft1, dict())
        cache2 = self._cache.setdefault(ft2, dict())
        try:
            return cache1[ft2]
        except KeyError:
            similarity = ft1.compare(ft2)
            cache1[ft2] = similarity
            cache2[ft1] = similarity
            return similarity
# NOTE(review): this chunk begins inside the envelope finder class; its header
# lies above this fragment, so the class statement is re-stated here to keep
# the methods below attached to it — confirm the class name against the
# original envelope_finder.py header.
class EnvelopeFinder:
    r"""
    Finds isotopic envelope candidates from a list of features sorted by m/z.

    Attributes
    ----------
    tolerance : float
        tolerance used to extend the element based bounds
    max_length : int
        max length of the envelopes

    Notes
    -----
    Using a list of elements, theoretical bounds are computed for each M1, M2,
    M3, etc... isotopologue. Then using these values and the `mz_tolerance` and
    the `max_charge`, the bounds are adjusted according to the following
    equations:

    .. math::

        mz_{k, min} = \frac{m_{k, min}}{q} - mz_{tolerance}

        mz_{k, max} = \frac{m_{k, max}}{q} + mz_{tolerance}

    where :math:`m_{k, min}` is the minimum theoretical value for the k-th
    isotopologue and q is the charge.

    The envelopes candidates found are determined based on m/z compatibility
    only. To reduce the number of candidates, the list of m/z values should be
    reduced by other means, such as correlation of the values.

    """

    def __init__(
        self,
        elements: list[str],
        mz_tolerance: float,
        max_length: int = 5,
        min_p: float = 0.01,
        min_similarity: float = 0.9,
    ):
        """
        Parameters
        ----------
        elements : list[str]
            List of elements used to compute mass difference windows.
        mz_tolerance : float
            m/z tolerance used to match candidates.
        max_length : int, default=5
            Maximum envelope length to search.
        min_p : float, default=0.01
            Number between 0 and 1. The minimum abundance of the isotopes of
            each element to be used for m/z estimation.
        min_similarity : float, default=0.9
            Minimum similarity to create candidates.

        """
        el_list = [PeriodicTable().get_element(x) for x in elements]
        self.tolerance = mz_tolerance
        self.max_length = max_length
        self.min_similarity = min_similarity
        self.bounds = _make_exact_mass_difference_bounds(el_list, min_p)

    def find(
        self,
        data: "AnnotationData",
        mmi: "Feature",
        charge: int,
    ) -> list[Sequence["Feature"]]:
        """
        Finds isotopic envelope candidates starting from the minimum mass
        isotopologue (MMI).

        Parameters
        ----------
        data : AnnotationData
            Feature data: the m/z-sorted feature list, the non-annotated set
            and the similarity cache.
        mmi : Feature
            Minimum mass feature.
        charge : int
            Absolute value of the charge state of the isotopic envelope.

        Returns
        -------
        envelopes : list[Sequence[Feature]]
            List of isotopic envelope candidates.

        """
        envelopes = _find_envelopes(
            data.features,
            mmi,
            data.non_annotated,
            data.similarity_cache,
            charge,
            self.max_length,
            self.tolerance,
            self.min_similarity,
            self.bounds,
        )
        envelopes = _remove_sub_candidates(envelopes)
        return envelopes


def _remove_sub_candidates(
    candidates: list[Sequence["Feature"]],
) -> list[Sequence["Feature"]]:
    """
    Remove candidates whose features are a subset of another candidate.

    FIX: the previous implementation decided and appended inside the inner
    comparison loop, which (a) appended the same candidate once per
    non-superset comparison and (b) silently dropped the final popped
    candidate, whose comparison loop never runs. A candidate is now kept
    exactly once, and only when it is not a subset of any other candidate
    (pending or already validated).
    """
    validated: list[Sequence["Feature"]] = []
    while candidates:
        current = candidates.pop()
        current_set = set(current)
        others = candidates + validated
        if not any(current_set <= set(other) for other in others):
            validated.append(current)
    return validated


def _find_envelopes(
    features: Sequence["Feature"],
    mmi: "Feature",
    non_annotated: set["Feature"],
    cache: "SimilarityCache",
    charge: int,
    max_length: int,
    mz_tolerance: float,
    min_similarity: float,
    bounds: dict[int, Tuple[float, float]],
) -> list[Sequence["Feature"]]:
    """
    Finds isotopic envelope candidates starting from the MMI.

    Parameters
    ----------
    features : Sequence[Feature]
        List of features sorted by m/z.
    mmi : Feature
        Minimum mass feature.
    non_annotated : set[Feature]
        Non annotated features.
    cache : SimilarityCache
        Cache of pairwise feature similarities.
    charge : int
        Absolute value of the charge state of the isotopic envelope.
    max_length : int
        Maximum length of the isotope candidates.
    mz_tolerance : float
        Tolerance used to extend the m/z search interval.
    min_similarity : float
        Minimum similarity to create candidates.
    bounds : dict
        Bounds obtained with _make_exact_mass_difference_bounds.

    Returns
    -------
    envelopes : list[Sequence[Feature]]
        List where each element is a list of features with isotopic envelope
        candidates.

    """
    completed_candidates = list()
    candidates = [[mmi]]
    while candidates:
        # remove and extend a candidate
        candidate = candidates.pop()

        # find features with compatible m/z and similarities
        min_mz, max_mz = _get_next_mz_search_interval(
            candidate, bounds, charge, mz_tolerance
        )
        start = bisect.bisect(features, min_mz)
        end = bisect.bisect(features, max_mz)
        new_features = list()
        for k in range(start, end):
            k_ft = features[k]
            is_similar = cache.get_similarity(mmi, k_ft) >= min_similarity
            is_non_annotated = k_ft in non_annotated
            if is_similar and is_non_annotated:
                new_features.append(k_ft)

        # extend candidates with compatible features
        length = len(candidate)
        if new_features and (length < max_length):
            tmp = [candidate + [x] for x in new_features]
            candidates.extend(tmp)
        else:
            completed_candidates.append(candidate)
    # a valid envelope needs at least the MMI plus one isotopologue
    completed_candidates = [x for x in completed_candidates if len(x) > 1]
    return completed_candidates


def _get_next_mz_search_interval(
    envelope: Sequence["Feature"],
    elements_mass_difference: dict[int, Tuple[float, float]],
    charge: int,
    mz_tolerance: float,
) -> Tuple[float, float]:
    """
    Computes the valid m/z range for a k-th isotopologue using information from
    m/z values from previous isotopologues.

    Parameters
    ----------
    envelope : Sequence[Feature]
        Features found so far, sorted by m/z.
    elements_mass_difference : dict
        Bounds obtained with _make_exact_mass_difference_bounds.
    charge : int
    mz_tolerance : float

    Returns
    -------
    min_mz : float
        Minimum m/z value for the M + k isotopologue.
    max_mz : float
        Maximum m/z value for the M + k isotopologue.

    """
    # If the charge is 0 (neutral mass) the results are the same as using
    # charge = 1. There is no difference between positive and negative charges.
    charge = max(1, abs(charge))
    length = len(envelope)
    # dummy values guaranteed to be replaced (or to yield an empty interval)
    min_mz = envelope[-1].mz + 2
    max_mz = envelope[-1].mz - 2
    for dm, (min_dM, max_dM) in elements_mass_difference.items():
        i = length - dm
        if i >= 0:
            min_mz = min(min_mz, envelope[i].mz + min_dM / charge)
            max_mz = max(max_mz, envelope[i].mz + max_dM / charge)
    min_mz -= mz_tolerance
    max_mz += mz_tolerance
    return min_mz, max_mz


def _make_exact_mass_difference_bounds(
    elements: list["Element"], min_p: float
) -> dict[int, Tuple[float, float]]:
    """
    Computes possible mass differences obtained from changing one isotope.

    Parameters
    ----------
    elements : list[Element]
    min_p : float
        Number between 0 and 1. Minimum abundance of the isotopes used.

    Returns
    -------
    bounds : dict
        Mapping of nominal mass increments to (min, max) exact mass
        increments, used by _get_next_mz_search_interval to estimate valid
        m/z ranges for isotopologues.

    """
    bounds = dict()
    for e in elements:
        m, M, p = e.get_abundances()
        for i in range(1, M.size):
            if p[i] > min_p:
                dm = m[i] - m[0]
                dM = M[i] - M[0]
                bounds.setdefault(dm, []).append(dM)

    # collapse each list of exact mass increments into a (min, max) pair
    for dm in bounds:
        bounds[dm] = min(bounds[dm]), max(bounds[dm])
    return bounds
# --------------------------------------------------------------------------
# File: src/tidyms/annotation/mmi_finder.py
# --------------------------------------------------------------------------
import numpy as np
import bisect
from typing import Optional
from .annotation_data import AnnotationData
from ..chem.atoms import Element, PeriodicTable, EM
from ..chem._formula_generator import FormulaCoefficientBounds
from ..chem.envelope_tools import make_formula_coefficients_envelopes
from ..lcms import Feature


class MMIFinder:
    """
    Finds Minimum Mass Isotopologue (MMI) candidates using an array of feature
    m/z and an array of feature area.

    """

    def __init__(
        self,
        bounds: dict[str, tuple[int, int]],
        max_mass: float,
        max_charge: int,
        length: int,
        bin_size: int,
        mz_tol: float,
        p_tol: float,
        min_similarity: float,
        custom_abundances: Optional[dict[str, np.ndarray]] = None,
    ):
        """
        Constructor method.

        Parameters
        ----------
        bounds : dict
            Mapping from an element symbol str to the minimum and maximum
            allowed values in formulas.
        max_mass : float
            Maximum mass to build rules.
        max_charge : int
            Maximum charge state searched; its sign sets the polarity used
            for the electron mass correction.
        length : int
            Length of the theoretical envelopes used to compute the search
            rules.
        bin_size : int
            Mass interval used to build the rules.
        mz_tol : float
            m/z tolerance to search candidates.
        p_tol : float
            Abundance tolerance used to search candidates.
        min_similarity : float
            Minimum similarity to create candidates.
        custom_abundances : dict, optional
            Provides custom elemental abundances. A mapping from element
            symbols str to an abundance array. The abundance array must have
            the same size that the natural abundance and its sum must be equal
            to one. For example, for "C", an alternative abundance can be
            array([0.15, 0.85]) for isotopes with nominal mass 12 and 13.

        """
        self.rules = _create_rules_dict(
            bounds, max_mass, length, bin_size, p_tol, custom_abundances
        )
        self.bin_size = bin_size
        self.max_charge = abs(max_charge)
        self.polarity = 1 if max_charge >= 0 else -1
        self.max_mass = max_mass
        self.mz_tol = mz_tol
        self.p_tol = p_tol
        self.min_similarity = min_similarity

    def find(self, data: AnnotationData) -> list[tuple[Feature, int]]:
        """
        Search MMI candidates using m/z and area information from a feature
        list.

        Parameters
        ----------
        data : AnnotationData
            Feature data; its feature list is sorted by m/z.

        Returns
        -------
        mmi_candidates : list[tuple[Feature, int]]
            (feature, charge) pairs, assuming that the monoisotopic feature
            is part of the envelope but not necessarily the MMI.

        """
        mono = data.get_monoisotopologue()
        candidates = list()

        if mono is None:
            return candidates

        for charge in range(1, self.max_charge + 1):
            # neutral mass for this charge state (electron mass corrected)
            M_mono = mono.mz * charge - self.polarity * charge * EM
            if M_mono < self.max_mass:
                # the monoisotopologue itself may be the MMI
                candidates.append((mono, charge))
                M_bin = int(M_mono // self.bin_size)
                mmi_rules = self.rules.get(M_bin)
                if mmi_rules is not None:
                    for i_rules in mmi_rules:
                        i_candidates = _find_candidate(
                            data,
                            mono,
                            charge,
                            i_rules,
                            self.mz_tol,
                            self.p_tol,
                            self.max_mass,
                            self.min_similarity,
                        )
                        candidates.extend(i_candidates)
        return candidates


def _find_candidate(
    data: AnnotationData,
    mono: Feature,
    charge: int,
    i_rules: dict,
    mz_tol: float,
    p_tol: float,
    max_mass: float,
    min_similarity: float,
) -> list[tuple[Feature, int]]:
    """
    Search MMI candidates compatible with one (dM, qp) rule.

    FIX: the return annotation previously read ``list[tuple[int, int]]`` but
    (Feature, charge) pairs are returned.
    """
    # valid m/z window for the MMI, below the monoisotopologue
    min_dM, max_dM = i_rules["dM"]
    min_mz = mono.mz - max_dM / charge - mz_tol
    max_mz = mono.mz - min_dM / charge + mz_tol
    min_qp = i_rules["qp"][0] - p_tol
    max_qp = i_rules["qp"][1] + p_tol

    if (mono.mz * charge) < max_mass:
        start = bisect.bisect(data.features, min_mz)
        end = bisect.bisect(data.features, max_mz)
    else:
        start, end = 0, 0  # dummy values: empty search range

    # if valid m/z were found, check if the abundance quotient qp is valid
    candidates = list()
    for k in range(start, end):
        candidate = data.features[k]
        is_valid = _check_candidate(
            data, mono, candidate, min_similarity, min_qp, max_qp
        )
        if is_valid:
            candidates.append((candidate, charge))
    return candidates


def _check_candidate(
    data: AnnotationData,
    mono: Feature,
    candidate: Feature,
    min_similarity: float,
    min_qp: float,
    max_qp: float,
) -> bool:
    """Check similarity and abundance-quotient compatibility of a candidate."""
    if candidate not in data.non_annotated:
        return False

    similarity = data.similarity_cache.get_similarity(mono, candidate)
    if similarity < min_similarity:
        return False

    # abundance quotient of the monoisotopologue relative to the MMI
    mmi_mono_pair = [candidate, mono]
    _, p = mono.compute_isotopic_envelope(mmi_mono_pair)
    qp = p[1] / p[0]
    is_valid_qp = (qp >= min_qp) & (qp <= max_qp)
    return is_valid_qp


def _create_rules_dict(
    bounds: dict[str, tuple[int, int]],
    max_mass: float,
    length: int,
    bin_size: int,
    p_tol: float,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> dict[int, list[dict[str, tuple[float, float]]]]:
    """Build, per mass bin, the (dM, qp) ranges an MMI must fulfill."""
    Ma, pa = _create_envelope_arrays(bounds, max_mass, length, custom_abundances)
    # find the monoisotopic index, its mass difference with the MMI (dM) and
    # its abundance quotient with the MMI (qp)
    bins = (Ma[:, 0] // bin_size).astype(int)

    # unique bins are used as keys for the rule dictionary
    unique_bins = np.unique(bins)

    rules = dict()
    for b in unique_bins:
        b_rules = list()
        bin_mask = bins == b
        for mi in range(1, length):
            qp = pa[bin_mask, mi] / pa[bin_mask, 0]
            dM = Ma[bin_mask, mi] - Ma[bin_mask, 0]
            # keep formulas where the mi-th isotopologue can dominate the MMI
            qp_mask = qp >= (1.0 - p_tol)
            if qp_mask.any():
                mi_rules = dict()
                dM_b_mi = dM[qp_mask]
                qp_b_mi = qp[qp_mask]
                mi_rules["dM"] = dM_b_mi.min(), dM_b_mi.max()
                mi_rules["qp"] = qp_b_mi.min(), qp_b_mi.max()
                b_rules.append(mi_rules)
        if b_rules:
            rules[b] = b_rules
    return rules


def _create_envelope_arrays(
    bounds: dict[str, tuple[int, int]],
    M_max: float,
    max_length: int,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> tuple[np.ndarray, np.ndarray]:
    """Compute exact mass and abundance arrays for all candidate formulas."""
    elements = _select_elements(list(bounds), custom_abundances)
    isotopes = [x.get_mmi() for x in elements]
    f_bounds = FormulaCoefficientBounds({x: bounds[x.get_symbol()] for x in isotopes})
    coeff = f_bounds.make_coefficients(M_max)
    envelope = make_formula_coefficients_envelopes(
        bounds, coeff, max_length, custom_abundances
    )
    return envelope.M, envelope.p


def _select_two_isotope_element(
    e_list: list[str], dm: int, custom_abundances: dict[str, np.ndarray]
) -> list[str]:
    """
    Select two-isotope elements with the given nominal mass difference.

    Elements whose second isotope dominates are all kept; of the remaining
    ones only the element with the most abundant second isotope is kept.
    """
    selected = list()
    p_dm_max = 0
    best_p0_greater_than_pi = None
    for s in e_list:
        e = PeriodicTable().get_element(s)
        n_isotopes = len(e.isotopes)
        m, _, p = e.get_abundances()
        if n_isotopes == 2:
            e_dm = m[-1] - m[0]
            if e_dm == dm:
                p0, pi = custom_abundances.get(s, p)
                if pi > p0:
                    selected.append(s)
                elif pi > p_dm_max:
                    p_dm_max = pi
                    best_p0_greater_than_pi = s
    if best_p0_greater_than_pi is not None:
        selected.append(best_p0_greater_than_pi)
    return selected


def _select_multiple_isotope_elements(e_list: list[str]) -> list[str]:
    """Select the elements with more than two stable isotopes."""
    selected = list()
    for s in e_list:
        e = PeriodicTable().get_element(s)
        if len(e.isotopes) > 2:
            selected.append(s)
    return selected


def _select_elements(
    e_list: list[str], custom_abundances: Optional[dict[str, np.ndarray]] = None
) -> list[Element]:
    """Select the elements that dominate the MMI search rules."""
    if custom_abundances is None:
        custom_abundances = dict()
    # NOTE: the selection helpers always return a list, so the previous
    # `is not None` guards were dead checks and are removed.
    selected = _select_multiple_isotope_elements(e_list)
    selected.extend(_select_two_isotope_element(e_list, 1, custom_abundances))
    selected.extend(_select_two_isotope_element(e_list, 2, custom_abundances))
    return [PeriodicTable().get_element(x) for x in selected]
# --------------------------------------------------------------------------
# File: src/tidyms/chem/__init__.py
# --------------------------------------------------------------------------
"""
Chemistry
=========

Provides:

1. A Formula object to compute the exact mass and isotopic distribution of molecular formulas.
2. A PeriodicTable with element and isotope information.
3. A formula generator object to search molecular formulas based on exact mass values.
4. An EnvelopeScorer that scores the similarity between experimental and theoretical isotopic envelopes.

Objects
-------
- PeriodicTable
- Formula
- FormulaGenerator
- EnvelopeScorer

Constants
---------
- EM : electron mass

"""

from ._formula_generator import FormulaGenerator, get_chnops_bounds
from .envelope_tools import EnvelopeScorer, EnvelopeValidator
from .formula import Formula
from .atoms import EM, PeriodicTable

# --------------------------------------------------------------------------
# File: src/tidyms/chem/_envelope_utils.py
# --------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Utilities to compute isotopic envelopes.

"""

import numpy as np
from functools import cache
from scipy.stats import multinomial
from typing import Dict, Optional, Tuple
from .atoms import Isotope, PeriodicTable
from . import utils


def make_envelope_arrays(
    isotope: Isotope, n_min: int, n_max: int, max_length: int, p=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Creates an array of exact mass and abundance for homonuclear formulas.

    Parameters
    ----------
    isotope : Isotope
    n_min : int
        Minimum formula coefficient.
    n_max : int
        Maximum formula coefficient.
    max_length : int
        Length of the envelope.
    p : array or None, default=None
        Element abundance. If None, the natural abundance is used.

    Returns
    -------
    M : (n_max - n_min + 1, max_length) array
        Coefficients exact mass.
    p : (n_max - n_min + 1, max_length) array
        Coefficients abundance.

    """
    rows = n_max - n_min + 1
    M_arr = np.zeros((rows, max_length))
    p_arr = np.zeros((rows, max_length))
    for k in range(n_min, n_max + 1):
        Mk, pk = _get_n_atoms_envelope(isotope, k, max_length, p=p)
        M_arr[k - n_min] = Mk
        p_arr[k - n_min] = pk
    return M_arr, p_arr


def find_formula_envelope(
    composition: Dict[Isotope, int],
    max_length: int,
    p: Optional[Dict[str, np.ndarray]] = None,
    min_p: float = 1e-10,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the isotopic envelope for a formula.

    Parameters
    ----------
    composition : Dict[Isotope, int]
        Mapping from isotopes to formula coefficients.
    max_length : int
        Length of the computed envelope.
    p : dict, optional
        Mapping from element symbols to custom abundance arrays.
    min_p : float, default=1e-10
        Entries with an abundance below this value are removed.
        NOTE(review): the mask-and-flatten removes low-abundance entries
        anywhere in the envelope, which shifts the position of later
        isotopologues if a middle entry falls below `min_p` — confirm this
        only ever trims trailing entries in practice.

    Returns
    -------
    Mf, pf : 1D arrays with the exact mass and abundance of each isotopologue.

    """
    if p is None:
        p = dict()

    # initialize an empty envelope for the formula
    Mf = np.zeros((1, max_length), dtype=float)
    pf = np.zeros((1, max_length), dtype=float)
    pf[0, 0] = 1

    # convolve the envelope of each element into the formula envelope
    for i, coeff in composition.items():
        i_p = p.get(i.get_symbol())
        Mi, pi = _get_n_atoms_envelope(i, coeff, max_length, p=i_p)
        Mi = Mi.reshape((1, Mi.size))
        pi = pi.reshape((1, pi.size))
        Mf, pf = combine_envelopes(Mf, pf, Mi, pi)
    valid_p_mask = pf >= min_p
    pf = pf[valid_p_mask].flatten()
    Mf = Mf[valid_p_mask].flatten()
    return Mf, pf


def combine_envelopes(
    M1: np.ndarray,
    p1: np.ndarray,
    M2: np.ndarray,
    p2: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combines exact mass and abundance of two envelopes.

    All arrays must be 2-dimensional and have the same shape.

    """
    shape = M1.shape
    M = np.zeros(shape, dtype=float)
    p = np.zeros(shape, dtype=float)
    # Ignore zero division errors when normalizing by pk
    with np.errstate(divide="ignore", invalid="ignore"):
        for k in range(shape[1]):
            # discrete convolution of the abundances, and abundance-weighted
            # average of the exact masses, for the k-th isotopologue
            pk = (p1[:, : k + 1] * p2[:, k::-1]).sum(axis=1)
            k1 = k + 1
            k2 = k
            Mk = (p1[:, :k1] * M1[:, :k1] * p2[:, k2::-1]) + (
                p1[:, :k1] * M2[:, k2::-1] * p2[:, k2::-1]
            )
            M[:, k] = Mk.sum(axis=1) / pk
            p[:, k] = pk
    np.nan_to_num(M, copy=False)
    return M, p


def _get_n_atoms_envelope(
    isotope: Isotope, n: int, max_length: int, p: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the exact mass and abundance envelope of n atoms.

    If the isotope is the monoisotope and p is ``None``, the natural
    abundances for the element are used.

    If the isotope is the monoisotope and a custom abundance `p` is provided,
    the envelope is computed using this value instead of the natural
    abundances.

    If the isotope is not the monoisotope, it is assumed that only this
    isotope contributes to the envelope.

    """
    symbol = isotope.get_symbol()
    element = PeriodicTable().get_element(symbol)
    is_monoisotope = isotope.a == element.get_monoisotope().a
    n_isotopes = len(element.isotopes)
    if is_monoisotope and (n_isotopes > 1):
        if n == 0:
            M, p = _get_n_isotopes_envelope(isotope, n, max_length)
        elif p is None:
            M, p = _get_n_atoms_natural_abundance(symbol, n, max_length)
        else:
            # custom abundance: validate against the element's isotopes
            m, M, _ = element.get_abundances()
            _validate_abundance(p, m, symbol)
            M, p = _get_n_atoms_envelope_aux(m, M, p, n, max_length)
    else:
        M, p = _get_n_isotopes_envelope(isotope, n, max_length)
    return M, p
@cache
def _get_n_atoms_natural_abundance(symbol: str, n: int, max_length: int):
    """
    Computes the envelope of n atoms using the natural abundance.

    Results are memoized: natural abundances are fixed per element, so the
    envelope only depends on (symbol, n, max_length).

    aux function to _get_n_atoms_envelope

    """
    m, M, p = PeriodicTable().get_element(symbol).get_abundances()
    return _get_n_atoms_envelope_aux(m, M, p, n, max_length)


def _get_n_atoms_envelope_aux(
    m: np.ndarray, M: np.ndarray, p: np.ndarray, n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the envelope of n atoms from isotope masses and abundances.

    Parameters
    ----------
    m : array[int]
        Nominal mass of each isotope.
    M : array[float]
        Exact mass of each isotope.
    p : array[float]
        Abundance of each isotope.
    n : int
        Number of atoms.
    max_length : int
        Length of the returned envelope.

    aux function to _get_n_atoms_envelope.

    """
    n_isotopes = p.size
    # find combinations of isotopes that sum n
    combinations = _find_n_isotope_combination(n_isotopes, n)

    # find m, M and p for each combination of isotopes
    multinomial_dist = multinomial(n, p)
    m = np.matmul(combinations, m)
    M = np.matmul(combinations, M)
    p = multinomial_dist.pmf(combinations)

    # sort by exact mass
    sorted_index = np.argsort(M)
    m, M, p = m[sorted_index], M[sorted_index], p[sorted_index]

    # merge values with the same nominal mass: abundances are summed and the
    # exact mass becomes the abundance-weighted average
    _, first_occurrence = np.unique(m, return_index=True)
    m_unique = np.zeros(max_length, dtype=m.dtype)
    M_unique = np.zeros(max_length, dtype=M.dtype)
    p_unique = np.zeros(max_length, dtype=p.dtype)
    # add the length of m to include all nominal mass values
    n_unique = first_occurrence.size
    first_occurrence = list(first_occurrence)
    first_occurrence.append(m.size)
    m0 = m[0]
    for k in range(max_length):
        if k < n_unique:
            start = first_occurrence[k]
            end = first_occurrence[k + 1]
            mk = m[start]
            i = mk - m0
            if i < max_length:
                m_unique[i] = mk
                pk = np.sum(p[start:end])
                p_unique[i] = pk
                M_unique[i] = np.sum(M[start:end] * p[start:end]) / pk
    p_unique = p_unique / np.sum(p_unique)
    return M_unique, p_unique


def _fill_missing_nominal(
    m: np.ndarray, M: np.ndarray, p: np.ndarray, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Re-index an envelope so that position k holds the isotopologue with
    nominal mass m[0] + k, filling gaps with zeros.

    If the nominal masses are already contiguous starting at m[0], the input
    arrays are returned unchanged.

    """
    rel_m = m - m[0]
    dm = np.arange(max_length)
    M_filled = np.zeros(max_length, dtype=M.dtype)
    p_filled = np.zeros(max_length, dtype=p.dtype)
    if not np.array_equal(rel_m, dm):
        for k, rel_m_k in enumerate(rel_m):
            if 0 <= rel_m_k < max_length:
                M_filled[rel_m_k] = M[k]
                p_filled[rel_m_k] = p[k]
            else:
                # rel_m is sorted, later entries are also out of range
                break
        M, p = M_filled, p_filled
    return M, p


def _find_n_isotope_combination(n_isotopes, n):
    """
    Finds combinations of isotopes such that the sum is n.

    aux function to _get_n_atoms_envelope_aux (the previous docstring pointed
    to a nonexistent ``_find_n_atoms_abundances``).

    """
    n_ranges = [range(x) for x in ([n + 1] * n_isotopes)]
    combinations = utils.cartesian_product(*n_ranges).astype(int)
    valid_combinations = combinations.sum(axis=1) == n
    combinations = combinations[valid_combinations, :]
    return combinations
def _validate_abundance(p: np.ndarray, m: np.ndarray, symbol: str):
    """
    Checks that user-created abundances are non-negative, normalized to 1 and
    have the same length as the number of stable isotopes.

    Raises
    ------
    ValueError
        If any of the conditions is not met.

    aux function to _get_n_atoms_envelope.

    """
    is_all_non_negative = (p >= 0.0).all()
    is_normalized = np.isclose(p.sum(), 1.0)
    is_same_size = p.size == m.size
    if not is_same_size:
        msg = "{} has {} stable isotopes. `p` must have the same size."
        raise ValueError(msg.format(symbol, m.size))
    elif not (is_normalized and is_all_non_negative):
        msg = "`p` elements must be non-negative and their sum normalized to 1."
        raise ValueError(msg)


def _get_n_isotopes_envelope(
    isotope: "Isotope", n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Creates the isotopic envelope for n atoms of a single isotope: all the
    abundance is concentrated at the first position.

    aux function to _get_n_atoms_envelope.

    """
    M = np.zeros(max_length, dtype=float)
    p = np.zeros(max_length, dtype=float)
    M[0] = isotope.m * n
    p[0] = 1.0
    return M, p


# --------------------------------------------------------------------------
# File: src/tidyms/chem/atoms.py
# --------------------------------------------------------------------------
"""
Tools for working with Isotopes and Elements.

Objects
-------
- Element
- Isotope
- PeriodicTable

Constants
---------
- EM: Mass of the electron.

Exceptions
----------
- InvalidIsotope

"""
import json
import numpy as np
import os.path
from string import digits
from typing import Dict, Final, Tuple, Union


EM: Final[float] = 0.00054858  # electron mass


class Isotope:
    """
    Representation of an Isotope.

    Attributes
    ----------
    z: int
        Atomic number
    n: int
        Neutron number
    a: int
        Mass number
    m: float
        Exact mass.
    defect: float
        Difference between the exact mass and mass number.
    abundance: float
        Relative abundance of the isotope.

    """

    __slots__ = ("z", "n", "a", "m", "defect", "abundance")

    def __init__(self, z: int, a: int, m: float, abundance: float):
        self.z = z
        self.n = a - z
        self.a = a
        self.m = m
        self.defect = m - a
        self.abundance = abundance

    def __str__(self):
        return "{}{}".format(self.a, self.get_symbol())

    def __repr__(self):
        return "Isotope({})".format(str(self))

    def get_element(self) -> "Element":
        """Return the Element this isotope belongs to."""
        return PeriodicTable().get_element(self.z)

    def get_symbol(self) -> str:
        """Return the element symbol of this isotope."""
        return self.get_element().symbol


class Element(object):
    """
    Representation of a chemical element.

    Attributes
    ----------
    name : str
        Element name.
    symbol : str
        Element symbol
    isotopes : Dict[int, Isotope]
        Mapping from mass number to an isotope
    z : int
        Atomic number.
    nominal_mass : int
        Mass number of the most abundant isotope
    monoisotopic_mass : float
        Exact mass of the most abundant isotope.
    mass_defect : float
        Difference between the monoisotopic mass and the nominal mass.

    """

    def __init__(self, symbol: str, name: str, isotopes: Dict[int, Isotope]):
        self.name = name
        self.symbol = symbol
        self.isotopes = isotopes
        monoisotope = self.get_monoisotope()
        self.z = monoisotope.z
        self.nominal_mass = monoisotope.a
        self.monoisotopic_mass = monoisotope.m
        self.mass_defect = self.monoisotopic_mass - self.nominal_mass

    def __repr__(self):
        return "Element({})".format(self.symbol)

    def __str__(self):  # pragma: no cover
        return self.symbol

    def get_abundances(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Returns the Mass number, exact mass and abundance of each Isotope.

        Returns
        -------
        m: array[int]
            Mass number of each isotope.
        M: array[float]
            Exact mass of each isotope.
        p: array[float]
            Abundance of each isotope.

        """
        isotopes = list(self.isotopes.values())
        m = np.array([x.a for x in isotopes], dtype=int)
        M = np.array([x.m for x in isotopes])
        p = np.array([x.abundance for x in isotopes])
        return m, M, p

    def get_mmi(self) -> Isotope:
        """Returns the isotope with the lowest atomic mass."""
        return min(self.isotopes.values(), key=lambda x: x.a)

    def get_monoisotope(self) -> Isotope:
        """Returns the most abundant isotope."""
        return max(self.isotopes.values(), key=lambda x: x.abundance)


def PeriodicTable():
    """
    Reference the PeriodicTable object.

    NOTE(review): lazy singleton; not thread-safe on first access.

    Examples
    --------
    >>> import tidyms as ms
    >>> ptable = ms.chem.PeriodicTable()

    """
    if _PeriodicTable.instance is None:
        _PeriodicTable.instance = _PeriodicTable()
    return _PeriodicTable.instance


class _PeriodicTable:
    """
    Periodic Table representation. Contains element and isotope information.

    Methods
    -------
    get_element
    get_isotope

    """

    instance = None

    def __init__(self):
        self._symbol_to_element = _make_periodic_table()
        self._z_to_element = {v.z: v for v in self._symbol_to_element.values()}
        self._za_to_isotope = dict()
        self._str_to_isotope = dict()
        for el_str in self._symbol_to_element:
            el = self._symbol_to_element[el_str]
            for isotope in el.isotopes.values():
                self._za_to_isotope[(isotope.z, isotope.a)] = isotope
                self._str_to_isotope[str(isotope.a) + el_str] = isotope

    def get_element(self, element: Union[str, int]) -> Element:
        """
        Returns an Element object using its symbol or atomic number.

        Parameters
        ----------
        element : str or int
            element symbol or atomic number.

        Returns
        -------
        Element

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> h = ptable.get_element("H")
        >>> c = ptable.get_element(6)

        """
        if isinstance(element, int):
            return self._z_to_element[element]
        return self._symbol_to_element[element]

    def __iter__(self):
        for el in self._symbol_to_element.values():
            yield el

    def get_isotope(self, x: str, copy: bool = False) -> Isotope:
        """
        Returns an isotope object from a string representation.

        Parameters
        ----------
        x : str
            A string representation of an isotope. If only the symbol is
            provided in the string, the monoisotope is returned.
        copy : bool
            If True creates a new Isotope object.

        Returns
        -------
        Isotope

        Raises
        ------
        InvalidIsotope
            If `x` cannot be interpreted as an isotope or element.

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> d = ptable.get_isotope("2H")
        >>> cl35 = ptable.get_isotope("Cl")

        """
        # FIX: also catch IndexError so an empty string raises InvalidIsotope
        # instead of leaking a bare IndexError from x[0].
        try:
            if x[0] in digits:
                isotope = self._str_to_isotope[x]
            else:
                isotope = self.get_element(x).get_monoisotope()
            if copy:
                isotope = Isotope(isotope.z, isotope.a, isotope.m, isotope.abundance)
            return isotope
        except (KeyError, IndexError):
            msg = "{} is not a valid input.".format(x)
            raise InvalidIsotope(msg)
If only the symbol is 221 | provided in the string, the monoisotope is returned. 222 | copy : bool 223 | If True creates a new Isotope object. 224 | 225 | Returns 226 | ------- 227 | Isotope 228 | 229 | Examples 230 | -------- 231 | >>> import tidyms as ms 232 | >>> ptable = ms.chem.PeriodicTable() 233 | >>> d = ptable.get_isotope("2H") 234 | >>> cl35 = ptable.get_isotope("Cl") 235 | 236 | """ 237 | try: 238 | if x[0] in digits: 239 | isotope = self._str_to_isotope[x] 240 | else: 241 | isotope = self.get_element(x).get_monoisotope() 242 | if copy: 243 | isotope = Isotope(isotope.z, isotope.a, isotope.m, isotope.abundance) 244 | return isotope 245 | except KeyError: 246 | msg = "{} is not a valid input.".format(x) 247 | raise InvalidIsotope(msg) 248 | 249 | 250 | def _make_periodic_table() -> Dict[str, Element]: 251 | this_dir, _ = os.path.split(__file__) 252 | elements_path = os.path.join(this_dir, "elements.json") 253 | with open(elements_path, "r") as fin: 254 | element_data = json.load(fin) 255 | 256 | isotopes_path = os.path.join(this_dir, "isotopes.json") 257 | with open(isotopes_path, "r") as fin: 258 | isotope_data = json.load(fin) 259 | 260 | periodic_table = dict() 261 | for element in isotope_data: 262 | element_isotopes = isotope_data[element] 263 | isotopes = {x["a"]: Isotope(**x) for x in element_isotopes} 264 | name = element_data[element] 265 | periodic_table[element] = Element(element, name, isotopes) 266 | return periodic_table 267 | 268 | 269 | class InvalidIsotope(ValueError): 270 | pass 271 | -------------------------------------------------------------------------------- /src/tidyms/chem/elements.json: -------------------------------------------------------------------------------- 1 | { 2 | "Xx": "Dummy", 3 | "H": "Hydrogen", 4 | "He": "Helium", 5 | "Li": "Lithium", 6 | "Be": "Beryllium", 7 | "B": "Boron", 8 | "C": "Carbon", 9 | "N": "Nitrogen", 10 | "O": "Oxygen", 11 | "F": "Fluorine", 12 | "Ne": "Neon", 13 | "Na": "Sodium", 14 | "Mg": 
"Magnesium", 15 | "Al": "Aluminium", 16 | "Si": "Silicon", 17 | "P": "Phosphorus", 18 | "S": "Sulfur", 19 | "Cl": "Chlorine", 20 | "Ar": "Argon", 21 | "K": "Potassium", 22 | "Ca": "Calcium", 23 | "Sc": "Scandium", 24 | "Ti": "Titanium", 25 | "V": "Vanadium", 26 | "Cr": "Chromium", 27 | "Mn": "Manganese", 28 | "Fe": "Iron", 29 | "Co": "Cobalt", 30 | "Ni": "Nickel", 31 | "Cu": "Copper", 32 | "Zn": "Zinc", 33 | "Ga": "Gallium", 34 | "Ge": "Germanium", 35 | "As": "Arsenic", 36 | "Se": "Selenium", 37 | "Br": "Bromine", 38 | "Kr": "Krypton", 39 | "Rb": "Rubidium", 40 | "Sr": "Strontium", 41 | "Y": "Yttrium", 42 | "Zr": "Zirconium", 43 | "Nb": "Niobium", 44 | "Mo": "Molybdenum", 45 | "Tc": "Technetium", 46 | "Ru": "Ruthenium", 47 | "Rh": "Rhodium", 48 | "Pd": "Palladium", 49 | "Ag": "Silver", 50 | "Cd": "Cadmium", 51 | "In": "Indium", 52 | "Sn": "Tin", 53 | "Sb": "Antimony", 54 | "Te": "Tellurium", 55 | "I": "Iodine", 56 | "Xe": "Xenon", 57 | "Cs": "Caesium", 58 | "Ba": "Barium", 59 | "La": "Lanthanum", 60 | "Ce": "Cerium", 61 | "Pr": "Praseodymium", 62 | "Nd": "Neodymium", 63 | "Pm": "Promethium", 64 | "Sm": "Samarium", 65 | "Eu": "Europium", 66 | "Gd": "Gadolinium", 67 | "Tb": "Terbium", 68 | "Dy": "Dysprosium", 69 | "Ho": "Holmium", 70 | "Er": "Erbium", 71 | "Tm": "Thulium", 72 | "Yb": "Ytterbium", 73 | "Lu": "Lutetium", 74 | "Hf": "Hafnium", 75 | "Ta": "Tantalum", 76 | "W": "Tungsten", 77 | "Re": "Rhenium", 78 | "Os": "Osmium", 79 | "Ir": "Iridium", 80 | "Pt": "Platinum", 81 | "Au": "Gold", 82 | "Hg": "Mercury", 83 | "Tl": "Thallium", 84 | "Pb": "Lead", 85 | "Bi": "Bismuth", 86 | "Po": "Polonium", 87 | "At": "Astatine", 88 | "Rn": "Radon", 89 | "Fr": "Francium", 90 | "Ra": "Radium", 91 | "Ac": "Actinium", 92 | "Th": "Thorium", 93 | "Pa": "Protactinium", 94 | "U": "Uranium", 95 | "Np": "Neptunium", 96 | "Pu": "Plutonium", 97 | "Am": "Americium", 98 | "Cm": "Curium", 99 | "Bk": "Berkelium", 100 | "Cf": "Californium", 101 | "Es": "Einsteinium", 102 | "Fm": "Fermium", 103 
| "Md": "Mendelevium", 104 | "No": "Nobelium", 105 | "Lr": "Lawrencium", 106 | "Rf": "Rutherfordium", 107 | "Db": "Dubnium", 108 | "Sg": "Seaborgium", 109 | "Bh": "Bohrium", 110 | "Hs": "Hassium", 111 | "Mt": "Meitnerium", 112 | "Ds": "Darmstadtium", 113 | "Rg": "Roentgenium", 114 | "Cn": "Copernicium", 115 | "Uut": "Ununtrium", 116 | "Fl": "Flerovium", 117 | "Uup": "Ununpentium", 118 | "Lv": "Livermorium", 119 | "Uus": "Ununseptium", 120 | "Uuo": "Ununoctium" 121 | } -------------------------------------------------------------------------------- /src/tidyms/chem/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | functions and classes used in different modules 4 | """ 5 | 6 | import numpy as np 7 | 8 | 9 | def cartesian_product(*args): 10 | res = None 11 | for x in args: 12 | if res is None: 13 | # initialize cartesian product array 14 | res = np.array(x) 15 | res = res.reshape((res.size, 1)) 16 | else: 17 | x = np.array(x) 18 | row, col = res.shape 19 | new_res_shape = (row * x.size, col + 1) 20 | new_res = np.zeros(shape=new_res_shape, dtype=res.dtype) 21 | ind = np.repeat(np.arange(row), x.size) 22 | new_col = np.tile(x, row) 23 | new_res[:, :col] = res[ind] 24 | new_res[:, -1] = new_col 25 | res = new_res 26 | return res 27 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.1.0 2 | pytest-cov>=3.0.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: 
import pandas as pd

from tidyms.simulation import simulate_dataset
from tidyms.container import DataContainer
from tidyms import fileio
from tidyms.utils import get_tidyms_path
import numpy as np
import pytest
import os


def pytest_sessionstart(session):
    """Download every public example dataset before the test session starts."""
    for dataset_name in fileio.list_available_datasets(False):
        fileio.download_dataset(dataset_name)


def _simulate_data_container(**extra) -> DataContainer:
    """Create the simulated two-class dataset shared by the fixtures below.

    Extra keyword arguments are forwarded to ``simulate_dataset``.
    """
    population = {"healthy": 20, "disease": 35}
    mean = {
        "healthy": np.array([50, 100, 150]),
        "disease": np.array([150, 200, 300]),
    }
    cov = {"healthy": np.array([1, 1, 1]), "disease": np.array([2, 2, 2])}
    blank_contribution = np.array([3, 5, 10])
    mz = np.array([100, 200, 300])
    rt = np.array([50, 60, 70])
    return simulate_dataset(
        population,
        mean,
        cov,
        mz,
        rt,
        blank_contribution,
        prepend_blank=1,
        append_blank=1,
        **extra,
    )


@pytest.fixture
def data_container_with_order():
    """Simulated dataset that includes run-order and batch information."""
    return _simulate_data_container()


@pytest.fixture
def data_container_with_order_single_qc():
    """Same as ``data_container_with_order`` but without triple QC injections."""
    return _simulate_data_container(triple_qc=False)


@pytest.fixture
def data_container_without_order(data_container_with_order):
    """Copy of the ordered dataset with run order and batch columns removed."""
    source = data_container_with_order
    data_matrix = source.data_matrix.copy()
    sample_metadata = source.sample_metadata.copy()
    sample_metadata.pop("order")
    sample_metadata.pop("batch")
    feature_metadata = source.feature_metadata.copy()
    mapping = dict()
    for key, value in source.mapping.items():
        if value is not None:
            mapping[key] = value
    return DataContainer(data_matrix, feature_metadata, sample_metadata, mapping)


def _raw_data_path(filename: str) -> str:
    """Absolute path of ``filename`` inside the cached test-raw-data folder."""
    return os.path.join(get_tidyms_path(), "test-raw-data", filename)


@pytest.fixture
def centroid_mzml():
    # NOTE(review): ms_mode="profile" for centroid data looks suspicious, but
    # it reproduces the original fixture exactly — confirm before changing.
    path = _raw_data_path("centroid-data-zlib-indexed-compressed.mzML")
    return fileio.MSData.create_MSData_instance(path, ms_mode="profile")


@pytest.fixture
def profile_mzml():
    path = _raw_data_path("profile-data-zlib-indexed-compressed.mzML")
    return fileio.MSData.create_MSData_instance(path, ms_mode="profile")
"""
Test lcms and fileio functionality with real data.

"""

import tidyms as ms
import numpy as np
import pytest
import os


@pytest.fixture
def ms_data_centroid() -> ms.MSData:
    """MSData instance backed by a real centroid-mode mzML file."""
    cache = ms.fileio.get_tidyms_path()
    filename = "centroid-data-zlib-indexed-compressed.mzML"
    return ms.MSData.create_MSData_instance(
        os.path.join(cache, "test-raw-data", filename)
    )


def _collect_times(ms_data, ms_level):
    """Acquisition time of each spectrum at ``ms_level``, as a numpy array."""
    times = [sp.time for _, sp in ms_data.get_spectra_iterator(ms_level=ms_level)]
    return np.array(times)


def test_ms_data_invalid_ms_mode_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.ms_mode = "invalid-mode"


def test_ms_data_invalid_instrument_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.instrument = "invalid-instrument"


def test_ms_data_invalid_separation_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.separation = "invalid-separation"


def test_make_chromatogram_ms_level_1(ms_data_centroid):
    # m/z values known to be present in the file
    mz = np.array([205.098, 524.37, 188.07])
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz)
    expected_rt = _collect_times(ms_data_centroid, 1)
    for chromatogram in chromatograms:
        assert np.array_equal(expected_rt, chromatogram.time)
        assert chromatogram.time.size == chromatogram.spint.size


def test_ms_data_get_spectrum(ms_data_centroid):
    # smoke test: retrieving the first spectrum must not raise
    ms_data_centroid.get_spectrum(0)
    assert True


def test_make_tic_ms_level_1(ms_data_centroid):
    tic = ms.make_tic(ms_data_centroid, ms_level=1)
    expected_rt = _collect_times(ms_data_centroid, 1)
    assert np.array_equal(expected_rt, tic.time)
    assert tic.time.size == tic.spint.size


def test_make_chromatogram_ms_level_2(ms_data_centroid):
    # m/z values known to be present in the file
    mz = np.array([205.098, 524.37, 188.07])
    ms_level = 2
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz, ms_level=ms_level)
    expected_rt = _collect_times(ms_data_centroid, ms_level)
    for chromatogram in chromatograms:
        assert np.array_equal(expected_rt, chromatogram.time)
        assert chromatogram.time.size == chromatogram.spint.size


def test_make_roi(ms_data_centroid):
    for roi in ms.make_roi(ms_data_centroid):
        # the time, intensity and scan arrays must be aligned
        assert roi.time.size == roi.spint.size
        assert roi.time.size == roi.scan.size


def test_accumulate_spectra(ms_data_centroid):
    spectrum = ms.accumulate_spectra(ms_data_centroid, start_time=20, end_time=30)
    assert spectrum.mz.size == spectrum.spint.size
"[C10H20BO3]-", 46 | "[C20H40BO5]2-", 47 | "[C18H19N2O3]-", 48 | "[C18H20N2O3Cl]-", 49 | "[C10H20Cl]-", 50 | ] 51 | rt_list = [50, 75, 150, 200, 200, 175] 52 | amp_list = [10000, 20000, 30000, 25000, 25000, 20000] 53 | return compounds, rt_list, amp_list 54 | 55 | 56 | @pytest.fixture 57 | def feature_list(compound_data) -> list[Peak]: 58 | compounds, rt_list, amp_list = compound_data 59 | mz_grid = np.linspace(100, 1200, 20000) 60 | rt_grid = np.arange(300) 61 | rt_params = list() 62 | mz_params = list() 63 | width = 4 64 | for comp, c_amp, c_rt in zip(compounds, amp_list, rt_list): 65 | f = Formula(comp) 66 | cM, cp = f.get_isotopic_envelope(4) 67 | cmz = [[x, y] for x, y in zip(cM, cp)] 68 | crt = [[c_rt, width, c_amp] for _ in cM] 69 | rt_params.append(crt) 70 | mz_params.append(cmz) 71 | mz_params = np.vstack(mz_params) 72 | rt_params = np.vstack(rt_params) 73 | ms_data = MSData_simulated(mz_grid, rt_grid, mz_params, rt_params, noise=0.025) 74 | 75 | roi_list = make_roi(ms_data, tolerance=0.01) 76 | ft_list = list() 77 | for k, r in enumerate(roi_list): 78 | r.extract_features() 79 | r.index = k 80 | if r.features: 81 | for j, ft in enumerate(r.features): 82 | ft.index = j 83 | ft_list.extend(r.features) 84 | return ft_list 85 | 86 | 87 | def test_annotate(feature_list, annotation_tools_params): 88 | tools = annotation.create_annotation_tools(**annotation_tools_params) 89 | annotation.annotate(feature_list, *tools) 90 | 91 | # group features by isotopologue label. 92 | annotation_check = dict() 93 | for ft in feature_list: 94 | group_list = annotation_check.setdefault(ft.annotation.isotopologue_label, list()) 95 | group_list.append(ft) 96 | annotation_check.pop(-1) 97 | assert len(annotation_check) == 6 98 | for v in annotation_check.values(): 99 | assert len(v) == 4 # features where generated with 4 isotopologues. 
100 | -------------------------------------------------------------------------------- /tests/unit/annotation/test_envelope_finder.py: -------------------------------------------------------------------------------- 1 | from tidyms.annotation import envelope_finder as ef 2 | from tidyms.annotation.annotation_data import AnnotationData 3 | from tidyms.chem import PeriodicTable 4 | from tidyms.chem import Formula 5 | from tidyms.lcms import LCTrace, Peak 6 | import pytest 7 | import numpy as np 8 | from collections.abc import Sequence 9 | 10 | 11 | @pytest.fixture 12 | def formulas(): 13 | formulas = { 14 | "cho": [ 15 | "C27H34O9", 16 | "C62H120O6", 17 | "C59H114O6", 18 | "C62H120O6", 19 | "C56H42O10", 20 | "C17H20O4", 21 | "C54H104O6", 22 | "C48H92O6", 23 | "C52H100O6", 24 | "C54H104O6", 25 | "C47H90O6", 26 | "C50H96O6", 27 | "C56H108O6", 28 | "C21H19O13", 29 | "C57H94O6", 30 | "C58H112O6", 31 | "C64H124O6", 32 | "C24H20O8", 33 | "C17H12O6", 34 | "C61H118O6", 35 | "C47H90O6", 36 | "C6H12O6", 37 | "C63H106O6", 38 | "C40H52O4", 39 | "C61H118O6", 40 | "C61H118O6", 41 | "C57H96O6", 42 | "C37H72O5", 43 | "C28H44O2", 44 | "C29H24O12", 45 | "C51H98O6", 46 | "C39H72O5", 47 | "C46H78O7", 48 | "C54H104O6", 49 | "C63H110O6", 50 | "C21H18O13", 51 | "C53H102O6", 52 | "C62H120O6", 53 | "C59H114O6", 54 | "C41H78O6", 55 | "C25H30O6", 56 | "C51H98O6", 57 | "C53H102O6", 58 | "C43H68O13", 59 | "C37H72O5", 60 | "C59H114O6", 61 | "C15H12O4", 62 | "C16H18O4", 63 | "C61H110O6", 64 | "C58H112O6", 65 | ], 66 | "chnops": [ 67 | "C41H80NO8P", 68 | "C54H104O6", 69 | "C27H40O2", 70 | "C24H26O12", 71 | "C55H106O6", 72 | "C45H80O16P2", 73 | "C50H96O6", 74 | "C8H13NO", 75 | "C35H36O15", 76 | "C48H92O6", 77 | "C63H98O6", 78 | "C15H14O5", 79 | "C18H23N3O6", 80 | "C44H80NO8P", 81 | "C47H90O6", 82 | "C47H84O16P2", 83 | "C14H14O4", 84 | "C46H80NO10P", 85 | "C35H64O9", 86 | "C51H98O6", 87 | "C6H12O6", 88 | "C26H34O7", 89 | "C17H18O4", 90 | "C6H8O9S", 91 | "C63H100O6", 92 | "C51H98O6", 93 | "C6H12O", 94 
| "C50H96O6", 95 | "C56H108O6", 96 | "C61H114O6", 97 | "C57H110O6", 98 | "C44H76NO8P", 99 | "C63H110O6", 100 | "C41H71O8P", 101 | "C16H16O10", 102 | "C21H20O15", 103 | "C4H6O3", 104 | "C16H18O9", 105 | "C51H98O6", 106 | "C57H94O6", 107 | "C4H9NO2", 108 | "C56H108O6", 109 | "C6H8O7", 110 | "C57H98O6", 111 | "C63H110O6", 112 | "C58H112O6", 113 | "C12H16O7S", 114 | "C27H30O12", 115 | "C26H28O16", 116 | "C27H38O12", 117 | ], 118 | } 119 | return formulas 120 | 121 | 122 | @pytest.fixture 123 | def elements(): 124 | elements = {"cho": ["C", "H", "O"], "chnops": ["C", "H", "N", "O", "P", "S"]} 125 | return elements 126 | 127 | 128 | def create_feature_list_from_formula(f_str: str) -> Sequence[Peak]: 129 | f = Formula(f_str) 130 | M, _ = f.get_isotopic_envelope() 131 | if f.charge: 132 | mz = M / abs(f.charge) 133 | else: 134 | mz = M 135 | feature_list = list() 136 | for k_mz in mz: 137 | size = 30 138 | time = np.linspace(0, size, size) 139 | scan = np.arange(size) 140 | spint = np.ones(size) 141 | roi = LCTrace(time, spint, spint * k_mz, scan) 142 | peak = Peak(10, 15, 20, roi) 143 | feature_list.append(peak) 144 | return feature_list 145 | 146 | 147 | @pytest.mark.parametrize("element_set", ["cho", "chnops"]) 148 | def test__make_exact_mass_difference_bounds(elements, element_set): 149 | # test bounds for different element combinations 150 | elements = elements[element_set] 151 | elements = [PeriodicTable().get_element(x) for x in elements] 152 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 153 | # m and M are the bounds for each nominal mass increment 154 | for e in elements: 155 | nom, ex, ab = e.get_abundances() 156 | nom = nom - nom[0] 157 | ex = ex - ex[0] 158 | for i, mi in zip(nom[1:], ex[1:]): 159 | m_min, m_max = bounds[i] 160 | assert m_min <= mi 161 | assert m_max >= mi 162 | 163 | 164 | @pytest.mark.parametrize("element_set", ["cho", "chnops"]) 165 | def test__get_next_mz_search_interval_mz(elements, formulas, element_set): 166 | elements 
= elements[element_set] 167 | elements = [PeriodicTable().get_element(x) for x in elements] 168 | dM_bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 169 | # test bounds for different formulas 170 | for f_str in formulas[element_set]: 171 | feature_list = create_feature_list_from_formula(f_str) 172 | length = len(feature_list) 173 | for k in range(1, length - 1): 174 | k_ft = feature_list[k] 175 | min_mz, max_mz = ef._get_next_mz_search_interval( 176 | feature_list[:k], dM_bounds, 1, 0.005 177 | ) 178 | assert (min_mz < k_ft.mz) and (k_ft.mz < max_mz) 179 | 180 | 181 | @pytest.mark.parametrize("charge", list(range(1, 6))) 182 | def test_get_k_bounds_multiple_charges(elements, formulas, charge): 183 | elements = elements["chnops"] 184 | formulas = formulas["chnops"] 185 | elements = [PeriodicTable().get_element(x) for x in elements] 186 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 187 | for f_str in formulas: 188 | features = create_feature_list_from_formula(f"[{f_str}]{charge}+") 189 | length = len(features) 190 | for k in range(1, length - 1): 191 | m_min, m_max = ef._get_next_mz_search_interval( 192 | features[:k], bounds, charge, 0.005 193 | ) 194 | assert (m_min < features[k]) and (features[k] < m_max) 195 | 196 | 197 | @pytest.mark.parametrize( 198 | "elements_set,charge", [["cho", 1], ["cho", 2], ["chnops", 1], ["chnops", 2]] 199 | ) 200 | def test__find_envelopes(formulas, elements, elements_set, charge): 201 | # test that the function works using as a list m/z values generated from 202 | # formulas. 
203 | elements = elements[elements_set] 204 | formulas = formulas[elements_set] 205 | elements = [PeriodicTable().get_element(x) for x in elements] 206 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 207 | max_length = 10 208 | mz_tol = 0.005 209 | min_similarity = 0.9 210 | for f_str in formulas: 211 | f_str = f"[{f_str}]{charge}+" 212 | features = create_feature_list_from_formula(f_str) 213 | data = AnnotationData(features) 214 | mmi = data.features[0] 215 | results = ef._find_envelopes( 216 | data.features, 217 | mmi, 218 | data.non_annotated, 219 | data.similarity_cache, 220 | charge, 221 | max_length, 222 | mz_tol, 223 | min_similarity, 224 | bounds, 225 | ) 226 | expected = features 227 | assert results[0] == expected 228 | 229 | 230 | @pytest.mark.parametrize("elements_set", ["cho", "chnops"]) 231 | def test__find_envelopes_no_charge(formulas, elements, elements_set): 232 | # test that the function works using as a list m/z values generated from 233 | # formulas. 234 | elements = elements[elements_set] 235 | formulas = formulas[elements_set] 236 | elements = [PeriodicTable().get_element(x) for x in elements] 237 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 238 | max_length = 10 239 | charge = 0 240 | mz_tol = 0.005 241 | min_similarity = 0.9 242 | for f_str in formulas: 243 | features = create_feature_list_from_formula(f_str) 244 | data = AnnotationData(features) 245 | mmi = features[0] 246 | results = ef._find_envelopes( 247 | features, 248 | mmi, 249 | data.non_annotated, 250 | data.similarity_cache, 251 | charge, 252 | max_length, 253 | mz_tol, 254 | min_similarity, 255 | bounds, 256 | ) 257 | expected = features 258 | assert results[0] == expected 259 | 260 | 261 | def test_EnvelopeFinder(elements, formulas): 262 | elements = elements["chnops"] 263 | formulas = formulas["chnops"] 264 | envelope_finder = ef.EnvelopeFinder(elements, 0.005, max_length=10) 265 | charge = 1 266 | for f_str in formulas: 267 | features = 
create_feature_list_from_formula(f_str) 268 | mmi = features[0] 269 | data = AnnotationData(features) 270 | results = envelope_finder.find(data, mmi, charge) 271 | expected = features 272 | assert len(results) == 1 273 | assert results[0] == expected 274 | -------------------------------------------------------------------------------- /tests/unit/annotation/test_mmi_finder.py: -------------------------------------------------------------------------------- 1 | from tidyms.annotation import mmi_finder 2 | from tidyms.annotation.annotation_data import AnnotationData 3 | from tidyms.chem import PeriodicTable 4 | from tidyms.lcms import LCTrace, Peak 5 | import pytest 6 | import numpy as np 7 | from typing import Sequence 8 | 9 | 10 | def test__select_two_isotope_elements_dm_1_p0_greater_than_pi(): 11 | elements = ["C", "H", "N", "O", "P", "S"] 12 | expected = ["C"] 13 | custom_abundances = dict() 14 | dm = 1 15 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 16 | assert len(res) == len(expected) 17 | assert set(res) == set(expected) 18 | 19 | 20 | def test__select_two_isotope_elements_dm_1_p0_greater_than_pi_custom_abundance(): 21 | elements = ["C", "H", "N", "O", "P", "S"] 22 | expected = ["H"] 23 | custom_abundances = {"H": np.array([0.95, 0.05])} 24 | dm = 1 25 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 26 | assert len(res) == len(expected) 27 | assert set(res) == set(expected) 28 | 29 | 30 | def test__select_two_isotope_elements_dm_1_no_elements(): 31 | elements = ["O", "P", "S"] 32 | custom_abundances = {} 33 | dm = 1 34 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 35 | assert len(res) == 0 36 | 37 | 38 | def test__select_two_isotope_elements_dm_1_p0_lower_than_pi(): 39 | elements = ["B", "Li", "O", "P", "S"] 40 | expected = ["B", "Li"] 41 | dm = 1 42 | custom_abundances = dict() 43 | res = mmi_finder._select_two_isotope_element(elements, dm, 
def test__select_two_isotope_elements_dm_1_p0_lower_and_higher_than_pi():
    symbols = ["C", "H", "B", "Li", "O", "P", "S"]
    expected = ["C", "B", "Li"]
    result = mmi_finder._select_two_isotope_element(symbols, 1, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_greater_than_pi():
    symbols = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["Cl"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_greater_than_pi_custom_abundance():
    symbols = ["Cl", "Br", "N", "O", "P", "S"]
    expected = ["Cl"]
    # Br abundance adjusted to force the result to be Cl
    custom_abundances = {"Br": np.array([0.9, 0.1])}
    result = mmi_finder._select_two_isotope_element(symbols, 2, custom_abundances)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_no_elements():
    symbols = ["O", "P", "S"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, {})
    assert len(result) == 0


def test__select_two_isotope_elements_dm_2_p0_lower_than_pi():
    symbols = ["In", "H", "O", "P", "S"]
    expected = ["In"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_lower_and_higher_than_pi():
    symbols = ["Cl", "In", "Br", "O", "P", "S"]
    expected = ["Br", "In"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_multiple_isotope_elements():
    symbols = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["O", "S"]
    result = mmi_finder._select_multiple_isotope_elements(symbols)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_multiple_isotope_elements_no_elements():
    symbols = ["Cl", "H", "N", "P"]
    expected = []
    result = mmi_finder._select_multiple_isotope_elements(symbols)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


@pytest.mark.parametrize(
    "elements,expected",
    [
        [["C", "H", "N", "O", "P", "S"], ["C", "O", "S"]],
        [["C", "H", "N", "O", "P", "S", "Cl", "Li", "Na"], ["C", "O", "S", "Li", "Cl"]],
    ],
)
def test__select_elements(elements, expected):
    selected = [x.symbol for x in mmi_finder._select_elements(elements)]
    assert len(selected) == len(expected)
    assert set(selected) == set(expected)


@pytest.fixture
def rules():
    """MMI search rules plus the parameters used to create them."""
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    max_mass = 2000.0
    length = 5
    bin_size = 100
    p_tol = 0.05
    rules_dict = mmi_finder._create_rules_dict(
        bounds, max_mass, length, bin_size, p_tol, None
    )
    return rules_dict, max_mass, length, bin_size


def create_peak_list(mz: list[float], sp: list[float]) -> Sequence[Peak]:
    """One Peak per (m/z, intensity) pair, each on its own flat LC trace."""
    peaks = list()
    n = 30
    time = np.linspace(0, n, n)
    scan = np.arange(n)
    ones = np.ones(n)
    for peak_mz, peak_sp in zip(mz, sp):
        trace = LCTrace(time.copy(), ones * peak_sp, ones * peak_mz, scan)
        peaks.append(Peak(10, 15, 20, trace))
    return peaks


def test__find_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # peak list where the monoisotopologue is actually the M1 member of a
    # Cl-containing envelope; the true MMI sits one Cl mass increment below
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    mono_index = 3
    mz = [100.0, 300.0, mono_mz - dm_cl, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1)]


def test__find_candidates_multiple_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # two peaks lie one Cl mass increment below the monoisotopologue, both
    # within the m/z tolerance: both must be reported as candidates
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    mono_index = 4
    first_mmi_mz = mono_mz - dm_cl
    second_mmi_mz = first_mmi_mz + 0.00001
    mz = [100.0, 300.0, first_mmi_mz, second_mmi_mz, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 500.5, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1), (peak_list[3], 1)]


def test__find_candidates_no_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # no peak below the monoisotopologue matches a Cl mass increment, so the
    # search must return an empty candidate list
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    mono_mz = 400.0
    charge = 1
    mono_index = 2
    mz = [100.0, 300.0, mono_mz, 456.0]
    sp = [100.0, 200.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert len(candidates) == 0


def test_MMIFinder():
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    max_mass = 2000.0
    length = 5
    bin_size = 100
    max_charge = 3
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9
    finder = mmi_finder.MMIFinder(
        bounds, max_mass, max_charge, length, bin_size, mz_tol, p_tol, min_similarity
    )

    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    peak_list = create_peak_list(
        [100.0, 300.0, mono_mz - dm_cl, mono_mz, 456.0],
        [100.0, 200.0, 500.0, 501.0, 34.0],
    )
    data = AnnotationData(peak_list)
    monoisotopologue = data.get_monoisotopologue()
    found = finder.find(data)
    expected = [
        (monoisotopologue, 1),
        (monoisotopologue, 2),
        (monoisotopologue, 3),
        (peak_list[2], 1),
    ]
    # compare as sets because candidate order is not guaranteed
    assert set(found) == set(expected)
import numpy as np
from tidyms import _batch_corrector
# import pytest
from statsmodels.nonparametric.smoothers_lowess import lowess


def test_correct_batches(data_container_with_order):
    # smoke test: correction with default parameters must run without raising
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        verbose=False
    )
    assert True


def test_correct_batches_frac(data_container_with_order):
    # test specifying a frac value
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        frac=0.7,
        verbose=False
    )
    assert True


def test_correct_batches_first_n(data_container_with_order):
    # test specifying a first_n value (comment fixed: it previously said
    # "frac", copy/pasted from the test above)
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        first_n=1,
        verbose=False
    )
    assert True


def test_lowess_min_n_samples():
    # lowess must return the input unchanged when fitted on fewer than four
    # points. A seeded generator makes the test deterministic.
    n = 4
    rng = np.random.default_rng(42)
    for k in range(2, n):
        x = np.arange(k)
        y = rng.normal(size=k)
        y_fit = lowess(y, x, is_sorted=True, return_sorted=False)
        assert np.allclose(y, y_fit)


def test_split_data_matrix(data_container_with_order):
    # Test if we can rebuild the matrix from the fragments
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    iterator = _batch_corrector._split_data_matrix(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        0.0
    )
    rebuilt = np.zeros(shape=data_matrix.shape, dtype=float)
    for start, k, order, xgk, _, _ in iterator:
        rebuilt[start + np.arange(xgk.size), k] = xgk.flatten()
    assert np.array_equal(data_matrix.to_numpy(), rebuilt)


def test_rebuild_data_matrix(data_container_with_order):
    # Test if we can rebuild the matrix from the fragments
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    iterator = _batch_corrector._split_data_matrix(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        0.0
    )

    # compute index used to rebuild the matrix but don't modify the values
    def process_chunk(args):
        start_index, column, order, x, train_index, predict_index = args
        index = np.arange(x.size) + start_index
        return x, index, column

    chunks = [process_chunk(x) for x in iterator]
    shape = data_matrix.shape
    rebuilt = _batch_corrector._rebuild_data_matrix(shape, chunks)
    X = data_matrix.to_numpy()
    assert np.array_equal(X, rebuilt)


def test_find_invalid_samples(data_container_with_order):
    # a well-formed run order contains no invalid samples
    data = data_container_with_order
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 0


def test_find_invalid_samples_remove_first_block(data_container_with_order):
    # check if study samples with order lower than qc samples are removed
    data = data_container_with_order
    sample_metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # modify one value at the beginning
    sample_metadata.at[sample_metadata.index[0], "class"] = sample_class[0]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 1


def test_find_invalid_samples_remove_last_block(data_container_with_order):
    # check if study samples with order higher than qc samples are removed
    # (comment fixed: it previously said "lower", copy/pasted from the test
    # above)
    data = data_container_with_order
    sample_metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # modify one value at the end (comment fixed: previously "beginning")
    sample_metadata.at[sample_metadata.index[-1], "class"] = sample_class[0]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 1


def test_find_invalid_samples_invalid_batch(
        data_container_with_order_single_qc):
    # batches without the minimum number of QC samples are removed whole
    # (comment fixed: it previously described the first/last-block tests)
    data = data_container_with_order_single_qc
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # the third batch has only two QC samples and must be removed.
    n_invalid = sample_metadata["batch"].value_counts()[3]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == n_invalid


def test_find_invalid_features(data_container_with_order):
    # with a zero threshold no feature should be flagged
    data = data_container_with_order
    data_matrix = data.data_matrix
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    threshold = 0.0
    min_detection_rate = 1.0
    invalid_features = _batch_corrector.find_invalid_features(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        threshold,
        min_detection_rate
    )
    assert invalid_features.size == 0


def test_find_invalid_features_threshold(data_container_with_order):
    # using high threshold, all features should be removed
    data = data_container_with_order
    data_matrix = data.data_matrix
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    threshold = 10000000.0
    min_detection_rate = 1.0
    invalid_features = _batch_corrector.find_invalid_features(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        threshold,
        min_detection_rate
    )
    assert invalid_features.size == data_matrix.shape[1]


from tidyms.chem import atoms
import pytest


def test_PeriodicTable_get_element_from_symbol():
    # element lookup by chemical symbol
    ptable = atoms.PeriodicTable()
    c = ptable.get_element("C")
    assert c.z == 6
    assert c.symbol == "C"
def test_PeriodicTable_get_isotope_from_symbol():
    # isotopes are addressed by "<mass number><symbol>" strings
    table = atoms.PeriodicTable()
    cl37 = table.get_isotope("37Cl")
    assert cl37.get_symbol() == "Cl"
    assert cl37.a == 37


def test_PeriodicTable_get_isotope_copy():
    # copy=True must yield an equivalent but distinct Isotope instance
    table = atoms.PeriodicTable()
    isotope_str = "37Cl"
    shared = table.get_isotope(isotope_str)
    duplicate = table.get_isotope(isotope_str, copy=True)
    assert shared is not duplicate
    assert shared.a == duplicate.a
    assert shared.m == duplicate.m
    assert shared.z == duplicate.z


@pytest.mark.parametrize(
    "z,a,m,abundance,expected_symbol",
    [
        [6, 12, 12.0, 0.9, "C"],  # Carbon. Dummy abundances and exact mass are used.
        [1, 1, 1.0078, 0.9, "H"],  # Hydrogen
        [15, 31, 30.099, 1.0, "P"]  # Phosphorus
    ]
)
def test_Isotope_get_symbol(z, a, m, abundance, expected_symbol):
    assert atoms.Isotope(z, a, m, abundance).get_symbol() == expected_symbol


def test_Element_get_monoisotope():
    # 11B is the most abundant boron isotope
    assert atoms.PeriodicTable().get_element("B").get_monoisotope().a == 11


def test_Element_get_mmi():
    # 10B is the minimum-mass boron isotope
    assert atoms.PeriodicTable().get_element("B").get_mmi().a == 10


from tidyms.chem import formula
from tidyms.chem.atoms import InvalidIsotope, PeriodicTable
import pytest


@pytest.mark.parametrize(
    "formula_str,p_open,p_close",
    [
        ("[Cr[H2O]6]3+", 0, 9),
        ("[C9H11NO2]", 0, 9),
        ("C9H11N(17O)2", 6, 10),
        ("[Cr[(2H)2O]6]3+", 3, 10),
    ],
)
def test_find_matching_parenthesis_valid_input(formula_str, p_open, p_close):
    # index of the bracket that closes the one opened at p_open
    assert formula._find_matching_parenthesis(formula_str, p_open) == p_close


@pytest.mark.parametrize(
    "formula_str,formula_without_charge,charge",
    [
        ("H2O", "H2O", 0),
        ("(13C)", "(13C)", 0),
        ("[CO3]2-", "CO3", -2),
        ("[Cr[H2O]6]3+", "Cr[H2O]6", 3),
        ("[C9H11NO2]", "[C9H11NO2]", 0),
        ("CO-", "CO", -1),
        ("[H2O]+", "[H2O]", 1),
        ("H2O+", "H2O", 1),
    ],
)
def test_parse_charge_valid_input(formula_str, formula_without_charge, charge):
    # the trailing charge is split from the rest of the formula
    stripped, q = formula._parse_charge(formula_str)
    assert q == charge
    assert stripped == formula_without_charge


@pytest.mark.parametrize("formula_str", ["SO42-"])
def test_parse_charge_invalid_input(formula_str):
    # multi-digit charges without an enclosing bracket are rejected
    with pytest.raises(formula.InvalidFormula):
        formula._parse_charge(formula_str)


@pytest.mark.parametrize(
    "formula_str,ind,token_type",
    [
        ("H2O", 0, 0),
        ("H2(34S)O4", 2, 1),
        ("[Cr(H2O)6]3+", 3, 2),
        ("[Fe[CN]6]4-", 3, 2),
    ],
)
def test_get_token_type(formula_str, ind, token_type):
    # inferred from the cases: 0 = element, 1 = isotope group,
    # 2 = bracketed sub-expression
    assert formula._get_token_type(formula_str, ind) == token_type


@pytest.mark.parametrize(
    "formula_str,ind,coeff,new_ind",
    [
        ("H2O", 3, 1, 3),
        ("CO2", 1, 1, 1),
        ("C9H11NO2", 3, 11, 5),
    ]
)
def test_get_coefficient_valid_input(formula_str, ind, coeff, new_ind):
    # an absent coefficient defaults to 1 and leaves the index unchanged
    parsed_coeff, next_ind = formula._get_coefficient(formula_str, ind)
    assert parsed_coeff == coeff
    assert next_ind == new_ind


@pytest.mark.parametrize(
    "formula_str,ind,new_ind,element",
    [
        ("H2O", 0, 2, "H"),
        ("H2O", 2, 3, "O"),
        ("C9H11NO2", 5, 6, "N"),
        ("C9H11N(17O)2", 5, 6, "N"),
        ("Cr(H2O)6", 0, 2, "Cr"),
    ]
)
def test_tokenize_element_valid_input(formula_str, ind, new_ind, element):
    token, next_ind = formula._tokenize_element(formula_str, ind)
    assert next_ind == new_ind
    # an element token contains the element's most abundant isotope
    expected_isotope = PeriodicTable().get_element(element).get_monoisotope()
    assert expected_isotope in token


@pytest.mark.parametrize(
    "formula_str,ind,isotope_str,new_ind",
    [
        ("(13C)O2", 0, "13C", 5),
        ("C9H11(15N)2O2", 5, "15N", 11),
        ("C6H12O5(18O)", 7, "18O", 12),
        ("C6H12O4(18O)2", 7, "18O", 13),
    ]
)
def test_tokenize_isotope_valid_input(formula_str, ind, isotope_str, new_ind):
    token, next_ind = formula._tokenize_isotope(formula_str, ind)
    assert next_ind == new_ind
    assert PeriodicTable().get_isotope(isotope_str) in token


@pytest.mark.parametrize(
    "f_str,composition",
    [
        ("H2O", {"1H": 2, "16O": 1}),
        ("(13C)O2", {"13C": 1, "16O": 2}),
        ("C9H11(15N)2O2", {"12C": 9, "1H": 11, "15N": 2, "16O": 2}),
        ("C9H11N2O2", {"12C": 9, "1H": 11, "14N": 2, "16O": 2}),
        ("Cr[(2H)2O]6", {"52Cr": 1, "2H": 12, "16O": 6})
    ]
)
def test_tokenize_formula(f_str, composition):
    # parsing yields per-isotope coefficients, explicit isotopes preserved
    table = PeriodicTable()
    expected = {table.get_isotope(k): v for k, v in composition.items()}
    parsed = formula._parse_formula(f_str)
    for isotope, coefficient in expected.items():
        assert parsed[isotope] == coefficient


def test_arg_sort_elements():
    # sorted by symbol first, then by mass number
    symbols = ["Cd", "C", "H", "H", "O", "O", "S", "B"]
    a = [60, 12, 2, 1, 16, 17, 32, 7]
    expected_order = [7, 1, 0, 3, 2, 4, 5, 6]
    assert formula._arg_sort_elements(symbols, a) == expected_order


@pytest.mark.parametrize(
    "charge,charge_str",
    [
        (1, "+"),
        (2, "2+"),
        (-1, "-"),
        (-4, "4-")
    ]
)
def test_get_charge_str(charge, charge_str):
    # unit charges render as a bare sign
    assert formula._get_charge_str(charge) == charge_str


@pytest.mark.parametrize(
    "f,f_str",
    [
        (formula.Formula("CO2"), "CO2"),
        (formula.Formula("(13C)C2H6O3"), "C2(13C)H6O3"),
        (formula.Formula("C24H46SPN(18O)2"), "C24H46N(18O)2PS"),
        (formula.Formula("[Cr(H2O)6]3+"), "[H12CrO6]3+"),
        (formula.Formula("CH3CH2CH3"), "C3H8"),
        (formula.Formula("F2"), "F2"),
    ]
)
def test_get_formula_str(f, f_str):
    # formulas are rendered in a canonical (Hill-like) element order
    assert str(f) == f_str


@pytest.mark.parametrize("f_str", ["(CO2", "#H2O"])
def test_parse_formula_invalid_formula(f_str):
    # unbalanced brackets or illegal characters are rejected
    with pytest.raises(formula.InvalidFormula):
        formula.Formula(f_str)


@pytest.mark.parametrize("f_str", ["(14C)O2", "(3H)2O"])
def test_parse_formula_invalid_isotope(f_str):
    # isotopes missing from the periodic table data are rejected
    with pytest.raises(InvalidIsotope):
        formula.Formula(f_str)


@pytest.fixture
def formula_data():
    # formula strings paired with their nominal and exact masses
    formula_str = ["CO2", "H2O", "F2"]
    nominal = [44, 18, 38]
    exact = [43.9898, 18.0106, 37.9968]
    return formula_str, nominal, exact


def test_get_exact_mass(formula_data):
    formula_str, _, exact = formula_data
    for f_str, expected in zip(formula_str, exact):
        assert abs(formula.Formula(f_str).get_exact_mass() - expected) < 0.0001


def test_get_nominal_mass(formula_data):
    formula_str, nominal, _ = formula_data
    for f_str, expected in zip(formula_str, nominal):
        assert formula.Formula(f_str).get_nominal_mass() == expected


def test_formula_from_dictionary():
    # plain element symbols map to their most abundant isotope
    composition = {"C": 1, "17O": 2, "H": 2}
    charge = 1
    f = formula.Formula(composition, charge)
    assert f.charge == charge
    for key in composition:
        assert PeriodicTable().get_isotope(key) in f.composition


def test_formula_from_dictionary_invalid_isotope():
    # "G" is not a valid element or isotope string
    composition = {"C": 1, "G": 4}
    with pytest.raises(InvalidIsotope):
        formula.Formula(composition, 1)


def test_formula_from_dictionary_invalid_isotope_type():
    # non-string keys are rejected
    composition = {4: 1, "G": 4}
    with pytest.raises(ValueError):
        formula.Formula(composition, 1)


@pytest.mark.parametrize(
    "composition,q",
    [
        [{"C": -1, "H": 4}, 1],
        [{"C": 1, "H": 4}, 0.5],
    ]
)
def test_formula_from_dictionary_invalid_coefficient(composition, q):
    # negative coefficients and non-integer charges are rejected
    with pytest.raises(ValueError):
        formula.Formula(composition, q)


def test_Formula_add():
    result = formula.Formula("H2O") + formula.Formula("CO2")
    assert result == formula.Formula("H2CO3")


def test_Formula_add_invalid_type():
    # only Formula instances may be added
    with pytest.raises(ValueError):
        formula.Formula("H2O") + "CO2"


def test_Formula_subtract_valid():
    result = formula.Formula("C6H12O6") - formula.Formula("CO2")
    assert result == formula.Formula("C5H12O4")


def test_Formula_subtract_invalid_type():
    with pytest.raises(ValueError):
        formula.Formula("C6H12O6") - "CO2"


def test_Formula_subtract_valid_zero_coeff():
    # elements whose coefficient drops to zero disappear from the result
    result = formula.Formula("C4H8O2") - formula.Formula("CO2")
    assert result == formula.Formula("C3H8")


def test_Formula_subtract_invalid_coeff():
    # a subtraction producing a negative coefficient is an error
    with pytest.raises(ValueError):
        formula.Formula("C4H8O") - formula.Formula("CO2")
import pytest
import numpy as np
from tidyms.chem import _envelope_utils as ids
from tidyms.chem import Formula, PeriodicTable
from itertools import product


@pytest.mark.parametrize(
    "isotope_symbol,n,max_length",
    product(["2H", "31P"], [0, 1, 5], [1, 2, 5]))
def test__get_n_isotopes_envelope(isotope_symbol: str, n: int, max_length: int):
    # n copies of one isotope: all mass in the first slot with abundance 1
    isotope = PeriodicTable().get_isotope(isotope_symbol)
    M, p = ids._get_n_isotopes_envelope(isotope, n, max_length)
    M_expected = np.zeros(max_length)
    M_expected[0] = n * isotope.m
    p_expected = np.zeros(max_length)
    p_expected[0] = 1.0
    assert np.array_equal(M, M_expected)
    assert np.array_equal(p, p_expected)


def test__validate_abundance_valid_value():
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.2])
    ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_negative_values():
    # abundances must be non-negative
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, -0.01])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_non_normalized():
    # abundances must sum to one
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.21])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_invalid_length():
    # the abundance array length must match the element's isotope count
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.015, 0.05])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


@pytest.mark.parametrize(
    "n_isotopes,n",
    [[1, 1], [1, 2], [1, 5], [1, 10], [2, 1], [2, 5], [2, 20], [5, 1], [5, 10]]
)
def test__find_n_isotopes_combination(n_isotopes, n):
    comb = ids._find_n_isotope_combination(n_isotopes, n)
    expected = [x for x in product(range(n + 1), repeat=n_isotopes) if sum(x) == n]
    expected = np.array(expected)
    # check that the row content is equal
    for x in expected:
        assert x in comb
    for x in comb:
        assert x in expected


@pytest.mark.parametrize(
    "element,max_length",
    product(["C", "S"], [2, 5, 10]))
def test__get_n_atoms_envelope_aux_n_1(element: str, max_length: int):
    # with n=1 the envelope is the normalized, length-padded elemental one
    element = PeriodicTable().get_element(element)
    me, Me, pe = element.get_abundances()
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, 1, max_length)
    Me, pe = ids._fill_missing_nominal(me, Me, pe, max_length)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe / np.sum(pe))


def test__get_n_atoms_envelope_aux_c_n_3_max_length_3():
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # use m_c12 consistently (the original mixed the literal 12 in)
    M_expected = np.array([3 * m_c12, 2 * m_c12 + m_c13, m_c12 + 2 * m_c13])
    assert np.allclose(M, M_expected)
    # BUG FIX: the original asserted np.sum(pe) == 1, but ``pe`` is the input
    # elemental abundance, which sums to 1 by construction (vacuous). The
    # intent is to check the computed envelope ``p`` is normalized; assumes
    # the helper renormalizes truncated envelopes, consistent with the n=1
    # test above -- verify against _get_n_atoms_envelope_aux.
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_c_n_3_max_length_5():
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 5
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    M_expected = np.array(
        [3 * m_c12, 2 * m_c12 + m_c13, m_c12 + 2 * m_c13, 3 * m_c13, 0]
    )
    assert np.allclose(M, M_expected)
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_s_n_2_max_length_3():
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    assert np.array_equal(M.round().astype(int), np.array([64, 65, 66]))
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the carbon max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_s_n_2_max_length_10():
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 10
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # 71 and 73 cannot be formed from two S isotopes, hence the zeros
    M_rounded = np.array([64, 65, 66, 67, 68, 69, 70, 0, 72, 0])
    assert np.array_equal(M.round().astype(int), M_rounded)
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the carbon max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope():
    element = PeriodicTable().get_element("C")
    c12 = element.isotopes[12]
    me, Me, pe = element.get_abundances()
    M, p = ids._get_n_atoms_envelope(c12, 1, 2)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe)


def test__get_n_atoms_envelope_custom_abundance():
    # a user-supplied abundance overrides the natural one
    element = PeriodicTable().get_element("C")
    c12 = element.isotopes[12]
    me, Me, pe = element.get_abundances()
    pe = np.array([0.8, 0.2])
    M, p = ids._get_n_atoms_envelope(c12, 1, 2, p=pe)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe)


def test__fill_missing_nominal_no_fill():
    # carbon does not need missing nominal values filled (typo fixed:
    # the comment previously said "feel")
    max_length = 5
    m = np.array([24, 25, 26, 0, 0])
    M = np.array([24.1, 24.2, 24.3, 0, 0])
    p = np.array([0.5, 0.3, 0.2, 0, 0])
    M_fill, p_fill = ids._fill_missing_nominal(m, M, p, max_length)
    assert np.allclose(M_fill, M)
    assert np.allclose(p_fill, p)


def test__fill_missing_nominal_fill():
    # Cl does not have an M + 1 isotope and must be filled
    # (dummy mass values are used here)
    max_length = 5
    m = np.array([105, 107, 109])
    M = np.array([105.1, 107.2, 109.3])
    p = np.array([0.5, 0.3, 0.2])
    M_fill, p_fill = ids._fill_missing_nominal(m, M, p, max_length)
    M_expected = np.array([M[0], 0, M[1], 0, M[2]])
    p_expected = np.array([p[0], 0, p[1], 0, p[2]])
    assert np.allclose(M_fill, M_expected)
    assert np.allclose(p_fill, p_expected)


def test__combine_envelopes_one_row_array():
    c12 = PeriodicTable().get_isotope("12C")
    max_length = 10
    n1 = 2
    n2 = 5
    n = n1 + n2
    M1, p1 = ids._get_n_atoms_envelope(c12, n1, max_length)
    M1 = M1.reshape((1, M1.size))
    p1 = p1.reshape((1, p1.size))
    M2, p2 = ids._get_n_atoms_envelope(c12, n2, max_length)
    # consistency fix: reshape with the arrays' own sizes (the original used
    # M1.size / p1.size; the values are equal because max_length is shared)
    M2 = M2.reshape((1, M2.size))
    p2 = p2.reshape((1, p2.size))
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = ids._get_n_atoms_envelope(c12, n, max_length)
    M_expected = M_expected.reshape((1, M_expected.size))
    p_expected = p_expected.reshape((1, p_expected.size))
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)


def test__combine_envelopes_multiple_row_array():
    # combining row-replicated envelopes equals replicating the combination
    c12 = PeriodicTable().get_isotope("12C")
    n_rep = 5
    max_length = 10
    n1 = 2
    n2 = 5
    n = n1 + n2
    M1, p1 = ids._get_n_atoms_envelope(c12, n1, max_length)
    M1 = np.tile(M1, (n_rep, 1))
    p1 = np.tile(p1, (n_rep, 1))
    M2, p2 = ids._get_n_atoms_envelope(c12, n2, max_length)
    M2 = np.tile(M2, (n_rep, 1))
    p2 = np.tile(p2, (n_rep, 1))
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = ids._get_n_atoms_envelope(c12, n, max_length)
    M_expected = np.tile(M_expected, (n_rep, 1))
    p_expected = np.tile(p_expected, (n_rep, 1))
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)
import pytest
import numpy as np
from tidyms.chem import EnvelopeScorer, EnvelopeValidator
from tidyms.chem import Formula, get_chnops_bounds


formula_str_list = ["C11H12N2O2", "C6H12O6", "C27H46O", "CO2", "HCOOH"]


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_find_valid_bounds(f_str):
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(max_length)
    tolerance = 0.005
    validator.generate_envelopes(M, p, tolerance)
    # results are not strictly equal due to being computed using a subset
    # of elements in the validator; a tolerance is used to check validity
    # in M and p
    p_tol = 0.0001
    M_tol = 0.0000001
    for k in range(M.size):
        min_M, max_M, min_p, max_p = validator._find_bounds(k)
        assert min_M - M_tol < M[k] < max_M + M_tol
        assert min_p - p_tol < p[k] < max_p + p_tol


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_validate(f_str):
    # envelopes from valid CHNOPS formulas must validate completely
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(max_length)
    validated_length = validator.validate(M, p)
    assert validated_length == max_length


def test_EnvelopeValidator_validate_invalid_envelope():
    # boron is outside the CHNOPS bounds, so no peak should validate
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula("C2H8B")
    M, p = f.get_isotopic_envelope(max_length)
    validated_length = validator.validate(M, p)
    expected_length = 0
    assert validated_length == expected_length


def _check_top_candidate(scorer, f_str, tolerance):
    # Helper: score the exact envelope of ``f_str`` and check that the
    # top-ranked candidate recovers its molecular formula. Extracted from
    # five tests that repeated the same pattern.
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(scorer.max_length)
    scorer.score(M, p, tolerance)
    coeff, isotopes, score = scorer.get_top_results(5)
    expected_coeff = [f.composition[x] for x in isotopes]
    assert np.array_equal(expected_coeff, coeff[0])


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer(f_str):
    # test that the best scoring candidate has the same molecular formula
    bounds = get_chnops_bounds(500)
    chnops_scorer = EnvelopeScorer(bounds, max_length=5)
    _check_top_candidate(chnops_scorer, f_str, 0.005)


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_length_gt_scorer_max_length(f_str):
    # an envelope longer than the scorer's max_length must be rejected
    f = Formula(f_str)
    max_length = 3
    bounds = get_chnops_bounds(500)
    chnops_scorer = EnvelopeScorer(bounds, max_length=max_length)
    M, p = f.get_isotopic_envelope(max_length + 1)
    tolerance = 0.005

    with pytest.raises(ValueError):
        chnops_scorer.score(M, p, tolerance)
    # NOTE(review): the original test also fetched and checked top results
    # after the raising call inside this block; since ``score`` raises, that
    # code could never execute and was removed as dead code.


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_custom_scorer(f_str):

    def cosine_scorer(mz1, ab1, mz2, ab2, **scorer_params):
        # NOTE(review): despite its name, this computes a distance-derived
        # score, not a true cosine similarity. Kept as-is because the test
        # only verifies that a user-supplied scorer is plugged in.
        n1 = np.linalg.norm(ab1)
        n2 = np.linalg.norm(ab2)
        norm = np.linalg.norm(ab1 - ab2)
        cosine = norm / (n1 * n2)
        return 1 - cosine

    bounds = get_chnops_bounds(500)
    envelope_scorer = EnvelopeScorer(bounds, scorer=cosine_scorer, max_length=5)
    _check_top_candidate(envelope_scorer, f_str, 0.005)


@pytest.fixture
def positive_elements_scorer():
    # scorer restricted to elements with positive mass-defect isotopes
    bounds = {"C": (0, 10), "H": (0, 10), "N": (0, 10)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["C2H3N", "N2H4", "C3N3H3"])
def test_EnvelopeScorer_positive_defect_elements_only(f_str, positive_elements_scorer):
    _check_top_candidate(positive_elements_scorer, f_str, 0.005)


@pytest.fixture
def negative_elements_scorer():
    # scorer restricted to elements with negative mass-defect isotopes
    bounds = {"C": (0, 10), "O": (0, 10), "S": (0, 10)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["CS2", "C2OS2", "C3SO"])
def test_EnvelopeScorer_negative_defect_elements_only(f_str, negative_elements_scorer):
    # note: this case uses a tighter tolerance (0.001) than the others
    _check_top_candidate(negative_elements_scorer, f_str, 0.001)


@pytest.fixture
def no_carbon_scorer():
    bounds = {"H": (0, 10), "O": (0, 5), "S": (0, 5), "P": (0, 5)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["H2O", "H3PO4", "H2SO4"])
def test_EnvelopeScorer_no_carbon(f_str, no_carbon_scorer):
    _check_top_candidate(no_carbon_scorer, f_str, 0.005)


from tidyms import consensus_annotation
from tidyms import _constants as c
import pandas as pd
import pytest
from collections import Counter


@pytest.fixture
def feature_table():
    # Three feature labels, all belonging to the same envelope.
    # Rows with -1 are noise.
    columns = [c.SAMPLE, c.LABEL, c.ENVELOPE_LABEL, c.ENVELOPE_INDEX, c.CHARGE]
    data = [
        [0, -1, -1, -1, -1],
        [1, -1, -1, -1, -1],
        [2, -1, -1, -1, -1],
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [2, 0, 0, 0, 1],
        [3, 0, 0, 1, 1],
        [4, 0, 0, 0, 2],
        [0, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [2, 1, 0, 1, 1],
        [3, 1, 0, 2, 1],
        [4, 1, 0, 1, 2],
        [0, 2, 0, 2, 1],
        [1, 2, 0, 2, 1],
        [2, 2, 0, 2, 1],
        [3, 2, 0, 2, 1],
        [4, 2, 0, 2, 2],
    ]
    return pd.DataFrame(data=data, columns=columns)


def test__build_graph(feature_table):
    # NOTE(review): the name suggests _build_graph, but the public entry
    # point vote_annotations is exercised; consider renaming in a follow-up.
    graph, annotations = consensus_annotation.vote_annotations(feature_table)
    assert len(annotations) == 3
    # the majority vote must recover the per-label consensus values
    for ft_label, ft_data in annotations.items():
        assert ft_data[c.CHARGE] == 1
        assert ft_data[c.ENVELOPE_LABEL] == 0
        assert ft_data[c.ENVELOPE_INDEX] == ft_label


def test__build_graph_nodes(feature_table):
    nodes = consensus_annotation._build_graph_nodes(feature_table)
    expected = {
        0: {c.CHARGE: 1, c.ENVELOPE_INDEX: 0},
        1: {c.CHARGE: 1, c.ENVELOPE_INDEX: 1},
        2: {c.CHARGE: 1, c.ENVELOPE_INDEX: 2}
    }
    assert nodes == expected


def test__build_graph_edges(feature_table):
    edges = consensus_annotation._build_graph_edges(feature_table)
    edge_count = Counter(edges)
    expected = Counter({(0, 1): 4, (0, 2): 4})
    assert edge_count == expected
10, 20000]] 14 | ) 15 | def test_make_initial_cluster(n, k, max_size): 16 | # n is the number of samples 17 | # k is the number of clusters 18 | # test with several sample sizes and check that the result is the same 19 | # as using DBSCAN without data split 20 | X1 = np.arange(n) 21 | X2 = np.arange(n) 22 | X = np.vstack((X1, X2)).T 23 | X = np.repeat(X, k, axis=0) 24 | X = np.random.permutation(X) 25 | # k cluster, no noise should be present 26 | eps = 0.1 27 | min_samples = round(n * 0.2) 28 | test_cluster = correspondence._cluster_dbscan( 29 | X, eps, min_samples, max_size 30 | ) 31 | dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="chebyshev") 32 | dbscan.fit(X) 33 | expected_cluster = dbscan.labels_ 34 | assert np.array_equal(test_cluster, expected_cluster) 35 | 36 | 37 | # test estimate n species 38 | 39 | @pytest.mark.parametrize( 40 | "min_samples,expected", 41 | [[1, np.array([2, 2])], [2, np.array([2, 2])], [3, np.array([0, 0])]]) 42 | def test_estimate_n_species_one_class(min_samples, expected): 43 | samples = np.array( 44 | [0] * 4 + [1] * 4 # 8 features detected in total in two samples 45 | ) 46 | clusters = np.array( 47 | ([0] * 2 + [1] * 2) * 2 # two clusters 48 | ) 49 | n_clusters = 2 50 | # two species in two clusters are expected 51 | res = correspondence._estimate_n_species_one_class( 52 | samples, clusters, min_samples, n_clusters 53 | ) 54 | assert np.array_equal(res, expected) 55 | 56 | 57 | def test_estimate_n_species_multiple_groups(): 58 | samples = np.array( 59 | [0] * 4 + [1] * 4 + [2] * 4 # 12 features in three samples 60 | ) 61 | clusters = np.array( 62 | ([0] * 2 + [1] * 2) * 3 # two clusters 63 | ) 64 | classes = np.array( 65 | [0] * 8 + [1] * 4 # two groups 66 | ) 67 | min_dr = 0.5 68 | # two species in two clusters are expected 69 | expected = {0: 2, 1: 2} 70 | include_classes = [0, 1] 71 | samples_per_class = {0: 2, 1: 1} 72 | 73 | res = correspondence._estimate_n_species( 74 | samples, clusters, classes, 
samples_per_class, include_classes, min_dr) 75 | assert res == expected 76 | 77 | 78 | # test _get_min_samples 79 | 80 | @pytest.fixture 81 | def samples_per_class(): 82 | res = { 83 | 0: 8, 84 | 1: 16, 85 | 2: 24 86 | } 87 | return res 88 | 89 | 90 | def test_get_min_samples_include_classes_none(samples_per_class): 91 | min_fraction = 0.25 92 | include_classes = None 93 | test_min_samples = correspondence._get_min_sample( 94 | samples_per_class, include_classes, min_fraction) 95 | expected_min_samples = round(sum(samples_per_class.values()) * min_fraction) 96 | assert expected_min_samples == test_min_samples 97 | 98 | 99 | def test_get_min_samples_include_classes(samples_per_class): 100 | min_fraction = 0.25 101 | include_classes = [0, 1] 102 | test_min_samples = correspondence._get_min_sample( 103 | samples_per_class, include_classes, min_fraction) 104 | n_include = [v for k, v in samples_per_class.items()if k in include_classes] 105 | expected_min_samples = round(min(n_include) * min_fraction) 106 | assert expected_min_samples == test_min_samples 107 | 108 | 109 | def test_process_cluster_one_species(): 110 | np.random.seed(1234) 111 | # features 112 | n = 200 113 | X = np.random.normal(size=(n, 2)) 114 | samples = np.arange(n) 115 | 116 | # add noise 117 | n_noise = 10 118 | noise = np.random.normal(size=(n_noise, 2), loc=4) 119 | X = np.vstack((X, noise)) 120 | s_noise = np.random.choice(samples, size=n_noise) 121 | samples = np.hstack((samples, s_noise)) 122 | 123 | expected = np.array([0] * n + [-1] * n_noise) 124 | 125 | n_species = 1 126 | max_deviation = 4 127 | labels, score = correspondence._process_cluster( 128 | X, samples, n_species, max_deviation) 129 | assert np.array_equal(labels, expected) 130 | 131 | 132 | def test_process_cluster_two_species(): 133 | np.random.seed(1234) 134 | # features 135 | n = 200 136 | x_list = list() 137 | s_list = list() 138 | for loc in [0, 4]: 139 | x_list.append(np.random.normal(size=(n, 2), loc=loc)) 140 | 
s_list.append(np.arange(n)) 141 | 142 | # add noise 143 | n_noise = 10 144 | x_list.append(np.random.normal(size=(n_noise, 2), loc=8)) 145 | X = np.vstack(x_list) 146 | s_list.append(np.random.choice(s_list[0], size=n_noise)) 147 | samples = np.hstack(s_list) 148 | 149 | expected = np.array([0] * n + [1] * n + [-1] * n_noise) 150 | 151 | n_species = 2 152 | max_deviation = 4 153 | labels, score = correspondence._process_cluster( 154 | X, samples, n_species, max_deviation) 155 | assert np.array_equal(labels, expected) 156 | 157 | 158 | def test_match_features(): 159 | np.random.seed(1234) 160 | # features 161 | n = 200 162 | x_list = list() 163 | s_list = list() 164 | for loc in [0, 4]: 165 | x_list.append(np.random.normal(size=(n, 2), loc=loc)) 166 | s_list.append(np.arange(n)) 167 | 168 | # add noise 169 | n_noise = 10 170 | x_list.append(np.random.normal(size=(n_noise, 2), loc=8)) 171 | X = np.vstack(x_list) 172 | s_list.append(np.random.choice(s_list[0], size=n_noise)) 173 | samples = np.hstack(s_list) 174 | 175 | feature_table = pd.DataFrame(X, columns=["mz", "rt"]) 176 | feature_table[c.SAMPLE] = samples 177 | feature_table[c.CLASS] = 0 178 | samples_per_class = {0: 200} 179 | 180 | expected = np.array([0] * n + [1] * n + [-1] * n_noise) 181 | 182 | labels = correspondence.match_features( 183 | feature_table, samples_per_class, None, 2, 2, 0.25, 4, verbose=True) 184 | labels = labels[c.LABEL] 185 | 186 | assert np.array_equal(labels, expected) 187 | -------------------------------------------------------------------------------- /tests/unit/test_fileio.py: -------------------------------------------------------------------------------- 1 | from tidyms import fileio 2 | from tidyms.utils import get_tidyms_path 3 | import os 4 | import pytest 5 | 6 | 7 | def test_read_mzmine(): 8 | dataset_name = "test-mzmine" 9 | cache_path = get_tidyms_path() 10 | data_path = os.path.join(cache_path, dataset_name) 11 | data_matrix_path = os.path.join(data_path, "data.csv") 12 
| sample_metadata_path = os.path.join(data_path, "sample.csv") 13 | try: 14 | fileio.read_mzmine(data_matrix_path, sample_metadata_path) 15 | except FileNotFoundError: 16 | fileio.download_dataset(dataset_name) 17 | fileio.read_mzmine(data_matrix_path, sample_metadata_path) 18 | assert True 19 | 20 | 21 | def test_read_progenesis(): 22 | # progenesis data is contained in one file 23 | dataset_name = "test-progenesis" 24 | cache_path = get_tidyms_path() 25 | data_path = os.path.join(cache_path, dataset_name) 26 | data_matrix_path = os.path.join(data_path, "data.csv") 27 | try: 28 | fileio.read_progenesis(data_matrix_path) 29 | except FileNotFoundError: 30 | fileio.download_dataset(dataset_name) 31 | fileio.read_progenesis(data_matrix_path) 32 | assert True 33 | 34 | 35 | def test_read_xcms(): 36 | dataset_name = "test-xcms" 37 | cache_path = get_tidyms_path() 38 | data_path = os.path.join(cache_path, dataset_name) 39 | data_matrix_path = os.path.join(data_path, "data.csv") 40 | sample_metadata_path = os.path.join(data_path, "sample.csv") 41 | feature_metadata_path = os.path.join(data_path, "feature.csv") 42 | try: 43 | fileio.read_xcms(data_matrix_path, feature_metadata_path, 44 | sample_metadata_path) 45 | except FileNotFoundError: 46 | fileio.download_dataset(dataset_name) 47 | fileio.read_xcms(data_matrix_path, feature_metadata_path, 48 | sample_metadata_path) 49 | assert True 50 | 51 | 52 | def test_read_compressed_indexed_mzml(centroid_mzml): 53 | n_spectra = centroid_mzml.get_n_spectra() 54 | n_chromatogram = centroid_mzml.get_n_chromatograms() 55 | 56 | # test spectra 57 | for k in range(n_spectra): 58 | centroid_mzml.get_spectrum(k) 59 | 60 | # test chromatogram 61 | for k in range(n_chromatogram): 62 | centroid_mzml.get_chromatogram(k) 63 | 64 | assert True 65 | 66 | 67 | def test_read_uncompressed_indexed_mzml(): 68 | cache_path = get_tidyms_path() 69 | filename = "centroid-data-indexed-uncompressed.mzML" 70 | data_path = os.path.join(cache_path, 
"test-raw-data", filename) 71 | ms_data = fileio.MSData.create_MSData_instance(data_path) 72 | n_spectra = ms_data.get_n_spectra() 73 | n_chromatogram = ms_data.get_n_chromatograms() 74 | 75 | # test spectra 76 | for k in range(n_spectra): 77 | ms_data.get_spectrum(k) 78 | 79 | # test chromatogram 80 | for k in range(n_chromatogram): 81 | ms_data.get_n_chromatograms() 82 | 83 | assert True 84 | 85 | 86 | def test_read_compressed_no_index_mzml(): 87 | cache_path = get_tidyms_path() 88 | filename = "centroid-data-zlib-no-index-compressed.mzML" 89 | data_path = os.path.join(cache_path, "test-raw-data", filename) 90 | ms_data = fileio.MSData.create_MSData_instance(data_path) 91 | n_spectra = ms_data.get_n_spectra() 92 | n_chromatogram = ms_data.get_n_chromatograms() 93 | 94 | # test spectra 95 | for k in range(n_spectra): 96 | ms_data.get_spectrum(k) 97 | 98 | # test chromatogram 99 | for k in range(n_chromatogram): 100 | ms_data.get_n_chromatograms() 101 | 102 | assert True 103 | 104 | 105 | def test_get_spectra_iterator_start(centroid_mzml): 106 | start = 9 107 | sp_iterator = centroid_mzml.get_spectra_iterator(start=start) 108 | for scan, sp in sp_iterator: 109 | assert scan >= start 110 | 111 | 112 | def test_get_spectra_iterator_end(centroid_mzml): 113 | expected_end = 20 114 | sp_iterator = centroid_mzml.get_spectra_iterator(end=expected_end) 115 | for scan, sp in sp_iterator: 116 | assert scan < expected_end 117 | 118 | 119 | def test_get_spectra_iterator_ms_level(centroid_mzml): 120 | expected_ms_level = 2 121 | sp_iterator = centroid_mzml.get_spectra_iterator(ms_level=expected_ms_level) 122 | for scan, sp in sp_iterator: 123 | assert sp.ms_level == expected_ms_level 124 | 125 | 126 | def test_get_spectra_iterator_start_time(centroid_mzml): 127 | start_time = 10 128 | sp_iterator = centroid_mzml.get_spectra_iterator(start_time=start_time) 129 | for scan, sp in sp_iterator: 130 | assert sp.time >= start_time 131 | 132 | 133 | def 
test_get_spectra_iterator_end_time(centroid_mzml): 134 | end_time = 20 135 | sp_iterator = centroid_mzml.get_spectra_iterator(end_time=end_time) 136 | for scan, sp in sp_iterator: 137 | assert sp.time < end_time 138 | 139 | 140 | def test_centroids(profile_mzml): 141 | profile_mzml.get_spectrum(0).find_centroids() 142 | assert True 143 | 144 | 145 | def test_load_dataset(): 146 | for d in fileio.list_available_datasets(): 147 | fileio.load_dataset(d) 148 | 149 | 150 | def test_load_dataset_invalid_dataset(): 151 | with pytest.raises(ValueError): 152 | fileio.load_dataset("invalid-dataset") 153 | -------------------------------------------------------------------------------- /tests/unit/test_fill_missing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | 4 | 5 | def test_get_fill_area_no_peaks_detected(monkeypatch): 6 | time = np.arange(100) 7 | spint = np.ones_like(time) 8 | chromatogram = ms.Chromatogram(time, spint) 9 | rt = 50 10 | rt_std = 10 11 | n_dev = 1 12 | 13 | def mock_extract_features(self, **kwargs): 14 | self.features = list() 15 | 16 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 17 | 18 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 19 | assert area is None 20 | 21 | 22 | def test_get_fill_area_peak_detected_outside_valid_range(monkeypatch): 23 | time = np.arange(100) 24 | spint = np.ones_like(time) 25 | chromatogram = ms.Chromatogram(time, spint) 26 | rt = 50 27 | rt_std = 10 28 | n_dev = 1 29 | 30 | def mock_extract_features(self, **kwargs): 31 | self.features = [ms.lcms.Peak(70, 75, 80, self)] 32 | 33 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 34 | 35 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 36 | assert area is None 37 | 38 | 39 | def test_get_fill_area_peak_detected_inside_valid_range(monkeypatch): 40 | time = np.arange(100) 41 | 
spint = np.ones_like(time) 42 | chromatogram = ms.Chromatogram(time, spint) 43 | chromatogram.baseline = np.zeros_like(time) 44 | rt = 50 45 | rt_std = 10 46 | n_dev = 1 47 | test_peak = ms.lcms.Peak(50, 55, 60, chromatogram) 48 | expected_area = test_peak.get_area() 49 | 50 | def mock_extract_features(self, **kwargs): 51 | self.features = [test_peak] 52 | 53 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 54 | 55 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 56 | assert np.isclose(area, expected_area) 57 | 58 | 59 | def test_get_fill_area_multiple_valid_peaks_choose_closest(monkeypatch): 60 | time = np.arange(100) 61 | spint = np.ones_like(time) 62 | chromatogram = ms.Chromatogram(time, spint) 63 | chromatogram.baseline = np.zeros_like(time) 64 | rt = 50 65 | rt_std = 10 66 | n_dev = 1 67 | valid_peak = ms.lcms.Peak(45, 50, 52, chromatogram) 68 | detected_peaks = [valid_peak, ms.lcms.Peak(55, 60, 65, chromatogram)] 69 | expected_area = valid_peak.get_area() 70 | 71 | def mock_extract_features(self, **kwargs): 72 | self.features = detected_peaks 73 | 74 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 75 | 76 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 77 | assert np.isclose(area, expected_area) 78 | -------------------------------------------------------------------------------- /tests/unit/test_filter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import tidyms as ms 4 | 5 | 6 | def test_class_remover(data_container_with_order): 7 | rm = ["QC"] 8 | data = data_container_with_order 9 | n_qc_samples = (data.sample_metadata["class"] == 'QC').sum() 10 | n_samples = data.data_matrix.shape[0] 11 | crm = ms.filter.ClassRemover(rm) 12 | crm.process(data) 13 | assert data.data_matrix.shape[0] == (n_samples - n_qc_samples) 14 | 15 | 16 | def 
test_class_remover_invalid_class(data_container_with_order): 17 | rm = ["invalid_class"] 18 | data = data_container_with_order 19 | crm = ms.filter.ClassRemover(rm) 20 | n_samples = data.data_matrix.shape[0] 21 | crm.process(data) 22 | assert data.data_matrix.shape[0] == n_samples 23 | 24 | 25 | # def test_prevalence_filter_remove_none(data_container_with_order): 26 | # data = data_container_with_order 27 | # process_classes = None 28 | # lb = 0 29 | # ub = 1 30 | # intraclass = True 31 | # threshold = 0 32 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes, lb=lb, 33 | # ub=ub, intraclass=intraclass, 34 | # threshold=threshold) 35 | # pf.process(data) 36 | # assert True 37 | # 38 | # 39 | # def test_prevalence_filter_remove_one_feature(data_container_with_order): 40 | # data = data_container_with_order 41 | # rm_ft = "FT01" 42 | # data._data_matrix.loc[:, rm_ft] = 0 43 | # process_classes = None 44 | # lb = 0.1 45 | # ub = 1 46 | # intraclass = True 47 | # threshold = 0 48 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes, 49 | # lb=lb, 50 | # ub=ub, 51 | # intraclass=intraclass, 52 | # threshold=threshold) 53 | # pf.process(data) 54 | # assert rm_ft in pf.remove 55 | # 56 | # 57 | # def test_blank_filter_custom_func(data_container_with_order): 58 | # data = data_container_with_order 59 | # bc = ms.filter.BlankCorrector(mode=lambda x: 20) 60 | # bc.process(data) 61 | # assert (data._data_matrix[data.classes 62 | # .isin(bc.params["process_classes"])] == 0).all().all() 63 | # 64 | # 65 | # def test_variation_filter(data_container_with_order): 66 | # data = data_container_with_order 67 | # vf = ms.filter.VariationFilter(lb=0, 68 | # ub=0.2, 69 | # process_classes=None) 70 | # vf.process(data) 71 | # print(vf.remove) 72 | # assert vf.remove.empty 73 | -------------------------------------------------------------------------------- /tests/unit/test_peaks.py: 
-------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import pytest 4 | from scipy.signal.windows import gaussian 5 | from scipy.special import erfc 6 | from scipy.ndimage import gaussian_filter1d 7 | # from itertools import product 8 | 9 | # random seed 10 | SEED = 1234 11 | 12 | 13 | # noise estimation tests 14 | 15 | @pytest.fixture 16 | def noise(): 17 | sigma = 1.0 18 | np.random.seed(SEED) 19 | return np.random.normal(size=500, scale=sigma), sigma 20 | 21 | 22 | def test_estimate_local_noise_empty_signal(): 23 | x = np.array([]) 24 | noise = ms.peaks._estimate_local_noise(x) 25 | assert np.isclose(noise, 0.0) 26 | 27 | 28 | @pytest.mark.parametrize("x", [np.array([1]), np.array([1, 2])]) 29 | def test_estimate_local_noise_signal_length_lower_than_two(x): 30 | noise = ms.peaks._estimate_local_noise(x) 31 | assert np.isclose(noise, 0.0) 32 | 33 | 34 | def test_estimate_local_noise(noise): 35 | # check that the noise estimation is close to the std of a normal 36 | # distribution 37 | x, sigma = noise 38 | noise_estimation = ms.peaks._estimate_local_noise(x) 39 | # noise should be close to sigma, check with a 20 % tolerance 40 | assert (sigma < 1.2 * noise_estimation) 41 | 42 | 43 | def test_estimate_local_noise_non_robust(noise): 44 | x, sigma = noise 45 | noise_estimation = ms.peaks._estimate_local_noise(x, robust=False) 46 | # noise should be close to sigma, check with a 20 % tolerance 47 | assert (sigma < 1.2 * noise_estimation) 48 | 49 | 50 | def test_estimate_noise_empty_array(): 51 | x = np.array([]) 52 | noise = ms.peaks.estimate_noise(x) 53 | assert noise.size == 0.0 54 | 55 | 56 | @pytest.mark.parametrize("x", [np.array([1]), np.array([1, 3]), 57 | np.array([1, 4, 6])]) 58 | def test_estimate_noise_signal_length_lower_than_two(x): 59 | noise_estimation = ms.peaks.estimate_noise(x) 60 | assert np.allclose(noise_estimation, 0.0) 61 | 62 | 63 | def 
test_estimate_noise_check_size(noise): 64 | noise, sigma = noise 65 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=2) 66 | assert noise.size == noise_estimation.size 67 | 68 | 69 | def test_estimate_noise_n_slices(noise): 70 | noise, sigma = noise 71 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=2) 72 | size = noise.size 73 | half = size // 2 74 | # check that the noise estimation was done for 2 slices 75 | assert np.allclose(noise_estimation[:half], noise_estimation[0]) 76 | assert np.allclose(noise_estimation[half:], noise_estimation[half]) 77 | # check that the estimation on each slice is different 78 | assert noise_estimation[0] != noise_estimation[half] 79 | 80 | 81 | def test_estimate_noise_min_slice_size(noise): 82 | noise, sigma = noise 83 | n_slices = 5 84 | min_slice_size = 150 85 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=n_slices, 86 | min_slice_size=min_slice_size) 87 | # noise has a size of 500, the slice is going to be 100 < 150 88 | # check that 150 is used instead. 89 | slice_boundaries = [0, 150, 300, 500] # the last slice is extended to 200 90 | # to prevent the creation of a slice of size 50 91 | for k in range(len(slice_boundaries) - 1): 92 | start = slice_boundaries[k] 93 | end = slice_boundaries[k + 1] 94 | assert np.allclose(noise_estimation[start:end], noise_estimation[start]) 95 | 96 | 97 | # Test baseline estimation 98 | 99 | def test_find_local_extrema(): 100 | x = np.arange(10) 101 | # reflect and merge the concatenate x. 
local extrema should be 0, 9, 19 102 | x = np.hstack((x, x[::-1])) 103 | test_output = ms.peaks._find_local_extrema(x) 104 | expected_output = [0, 9, 19] 105 | assert np.array_equal(test_output, expected_output) 106 | 107 | 108 | def test_find_local_extrema_no_local_maximum(): 109 | x = np.arange(10) 110 | test_output = ms.peaks._find_local_extrema(x) 111 | expected_output = np.array([]) 112 | assert np.array_equal(test_output, expected_output) 113 | 114 | 115 | test_noise_sum_params = [[np.array([0, 1]), np.sqrt([25, 25])], 116 | [np.array([0]), np.sqrt([34])]] 117 | 118 | 119 | @pytest.mark.parametrize("index,expected", test_noise_sum_params) 120 | def test_get_noise_sum_slice_std(index, expected): 121 | index = np.array(index) 122 | expected = np.array(expected) 123 | x = np.array([3, 4, 2, 2, 1]) 124 | test_output = ms.peaks._get_noise_slice_sum_std(x, index) 125 | assert np.allclose(test_output, expected) 126 | 127 | 128 | def test_estimate_noise_probability(): 129 | noise = np.ones(7) 130 | x = np.array([0, 0.1, 0.4, 2, 1.25, 1.1, 1.0]) 131 | extrema = np.array([0, 3, 6]) 132 | # two slices of size 4 and 2 respectively, the expected output should 133 | # be erfc(1/sqrt(2) and erfc(1) 134 | expected_output = erfc([2.5 * np.sqrt(1 / 2) / 2, 135 | 1.35 * np.sqrt(1 / 2) / 2]) 136 | test_output = ms.peaks._estimate_noise_probability(noise, x, extrema) 137 | assert np.allclose(expected_output, test_output) 138 | 139 | 140 | def test_build_baseline_index(): 141 | x = np.array([0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2, 1, 0]) 142 | extrema = np.array([0, 2, 4, 6, 8, 10, 12]) 143 | noise_probability = np.array([0, 0.25, 0.25, 0.25, 0, 0]) 144 | min_proba = 0.05 145 | expected = np.array([0, 4, 5, 6, 12]) 146 | test = ms.peaks._build_baseline_index(x, noise_probability, min_proba, 147 | extrema) 148 | assert np.array_equal(expected, test) 149 | 150 | 151 | def test_estimate_baseline(): 152 | # a simple test, a noise array is built using a noise level greater 153 | # than the 
noise level in the signal. All points should be classified as 154 | # baseline 155 | n = 100 156 | x = np.random.normal(size=n, scale=1) 157 | noise = np.ones(n) * 5 158 | baseline = ms.peaks.estimate_baseline(x, noise) 159 | expected_baseline_index = np.arange(n) 160 | test_baseline_index = np.where(np.abs(x - baseline) < noise)[0] 161 | assert np.array_equal(expected_baseline_index, test_baseline_index) 162 | 163 | 164 | @pytest.fixture 165 | def single_peak(noise): 166 | noise, sigma = noise 167 | x = gaussian(noise.size, 2) * 20 168 | return x 169 | 170 | 171 | @pytest.fixture 172 | def two_non_overlapping_peaks(noise): 173 | noise, sigma = noise 174 | x = np.arange(noise.size) 175 | params = np.array([[100, 2, 50], [150, 2, 25]]) 176 | y = ms.utils.gaussian_mixture(x, params).sum(axis=0) 177 | return y, params 178 | 179 | 180 | def test_detect_peaks_one_peak(single_peak, noise): 181 | noise, sigma = noise 182 | x = single_peak + noise 183 | noise_estimation = ms.peaks.estimate_noise(x) 184 | # smooth x to reduce the number of detected peaks 185 | x = gaussian_filter1d(x, 1.0) 186 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 187 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 188 | assert len(peaks[0]) == 1 189 | 190 | 191 | def test_detect_peaks_two_non_overlapping_peaks(two_non_overlapping_peaks, 192 | noise): 193 | noise, sigma = noise 194 | x, _ = two_non_overlapping_peaks 195 | x = x + noise 196 | noise_estimation = ms.peaks.estimate_noise(x) 197 | # smooth x to reduce the number of detected peaks 198 | x = gaussian_filter1d(x, 1.0) 199 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 200 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 201 | assert len(peaks[0]) == 2 202 | 203 | 204 | @pytest.fixture 205 | def two_overlapping_peaks(noise): 206 | noise, sigma = noise 207 | x = np.arange(noise.size) 208 | params = np.array([[100, 2, 50], [108, 2, 25]]) 209 | y = 
ms.utils.gaussian_mixture(x, params).sum(axis=0) 210 | return y, params 211 | 212 | 213 | def test_detect_peaks_two_overlapping_peaks(two_overlapping_peaks, noise): 214 | noise, sigma = noise 215 | x, _ = two_overlapping_peaks 216 | x = x + noise 217 | noise_estimation = ms.peaks.estimate_noise(x) 218 | # smooth x to reduce the number of detected peaks 219 | x = gaussian_filter1d(x, 1.0) 220 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 221 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 222 | start, apex, end = peaks 223 | # only two peaks are detected 224 | assert len(start) == 2 225 | # check the boundary of the overlapping peaks 226 | assert end[0] == (start[1] + 1) 227 | -------------------------------------------------------------------------------- /tests/unit/test_validation.py: -------------------------------------------------------------------------------- 1 | from tidyms.validation import * 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def example_validator(): 7 | schema = { 8 | "positive_number": {"is_positive": True}, 9 | "a": {"lower_than": "b"}, 10 | "b": {"lower_or_equal": "c"}, 11 | "c": {"type": "number"}, 12 | "some_function": {"check_with": is_callable} 13 | } 14 | return ValidatorWithLowerThan(schema) 15 | 16 | 17 | def test_is_positive_positive_number(example_validator): 18 | params = {"positive_number": 5} 19 | validate(params, example_validator) 20 | assert True 21 | 22 | 23 | def test_is_positive_zero(example_validator): 24 | params = {"positive_number": 0} 25 | with pytest.raises(ValueError): 26 | validate(params, example_validator) 27 | 28 | 29 | def test_is_positive_negative_number(example_validator): 30 | params = {"positive_number": -1} 31 | with pytest.raises(ValueError): 32 | validate(params, example_validator) 33 | 34 | 35 | def test_lower_than_valid(example_validator): 36 | # a must be lower than b 37 | params = {"a": 5, "b": 6} 38 | validate(params, example_validator) 39 | assert True 
40 | 41 | 42 | def test_lower_than_invalid(example_validator): 43 | # a must be lower than b 44 | params = {"a": 5, "b": 4} 45 | with pytest.raises(ValueError): 46 | validate(params, example_validator) 47 | 48 | 49 | def test_lower_than_invalid_equal(example_validator): 50 | # a must be lower than b 51 | params = {"a": 5, "b": 5} 52 | with pytest.raises(ValueError): 53 | validate(params, example_validator) 54 | 55 | 56 | def test_lower_or_equal_valid(example_validator): 57 | # a must be lower than b 58 | params = {"b": 5, "c": 7} 59 | validate(params, example_validator) 60 | assert True 61 | 62 | 63 | def test_lower_or_equal_valid_equal(example_validator): 64 | # a must be lower than b 65 | params = {"b": 5, "c": 5} 66 | validate(params, example_validator) 67 | assert True 68 | 69 | 70 | def test_lower_or_equal_invalid(example_validator): 71 | # a must be lower than b 72 | params = {"b": 5, "c": 4} 73 | with pytest.raises(ValueError): 74 | validate(params, example_validator) 75 | 76 | 77 | def test_is_callable_valid(example_validator): 78 | # a must be lower than b 79 | params = {"some_function": sum} 80 | validate(params, example_validator) 81 | assert True 82 | 83 | 84 | def test_is_callable_invalid(example_validator): 85 | # a must be lower than b 86 | params = {"some_function": "invalid_value"} 87 | with pytest.raises(ValueError): 88 | validate(params, example_validator) 89 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | python3.9,python3.10 4 | 5 | [testenv] 6 | deps= -rtest_requirements.txt 7 | commands=pytest --------------------------------------------------------------------------------