├── .coveragerc ├── .flake8 ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── conftest.py ├── docs ├── Makefile ├── _templates │ └── autosummary │ │ ├── base.rst │ │ ├── class.rst │ │ └── module.rst ├── api.rst ├── bokeh_plots.py ├── chem.rst ├── conf.py ├── data-curation.rst ├── descriptors.csv ├── examples │ ├── DARTMS_MTBLS1198_SeaOmics__processing.ipynb │ ├── DARTMS_processing.ipynb │ ├── DARTMS_processing_ParameterOptimization.ipynb │ ├── custom_peak_descriptors.py │ ├── defined_spots_supervised.tsv │ └── roi-creation.py ├── feature-correspondence.rst ├── fileio.rst ├── fileio_tutorial.rst ├── glossary.rst ├── index.rst ├── installation.rst ├── mzml.rst ├── peak-picking.rst ├── plots │ ├── dbscan-clustering.py │ ├── dbscan-parameters.py │ ├── gmm-clustering.py │ ├── peak-definition.py │ ├── peak-detection-example.py │ ├── peak_detection_baseline_example.py │ └── roi-definition.py ├── preprocessing-steps.csv ├── processing_datasets.rst ├── quickstart.rst ├── requirements.txt └── tutorials.rst ├── pyproject.toml ├── requirements.txt ├── src └── tidyms │ ├── __init__.py │ ├── _batch_corrector.py │ ├── _build_data_matrix.py │ ├── _constants.py │ ├── _filter_functions.py │ ├── _mzml.py │ ├── _plot_bokeh.py │ ├── annotation │ ├── __init__.py │ ├── annotation.py │ ├── annotation_data.py │ ├── envelope_finder.py │ └── mmi_finder.py │ ├── assay.py │ ├── chem │ ├── __init__.py │ ├── _envelope_utils.py │ ├── _formula_generator.py │ ├── atoms.py │ ├── elements.json │ ├── envelope_tools.py │ ├── formula.py │ ├── isotopes.json │ └── utils.py │ ├── consensus_annotation.py │ ├── container.py │ ├── correspondence.py │ ├── dartms.py │ ├── fileio.py │ ├── fill_missing.py │ ├── filter.py │ ├── lcms.py │ ├── peaks.py │ ├── raw_data_utils.py │ ├── simulation.py │ ├── utils.py │ └── validation.py ├── test_requirements.txt ├── tests ├── __init__.py ├── conftest.py ├── integration │ ├── test_assay_real_data.py │ └── test_real_raw_data.py 
└── unit │ ├── annotation │ ├── test_annotation.py │ ├── test_envelope_finder.py │ └── test_mmi_finder.py │ ├── test_assay.py │ ├── test_batch_corrector.py │ ├── test_build_data_matrix.py │ ├── test_chem │ ├── test_atoms.py │ ├── test_formula.py │ ├── test_formula_generator.py │ ├── test_isotope_distributions.py │ └── test_isotope_scorer.py │ ├── test_consensus_annotation.py │ ├── test_correspondence.py │ ├── test_data_container.py │ ├── test_fileio.py │ ├── test_fill_missing.py │ ├── test_filter.py │ ├── test_lcms.py │ ├── test_peaks.py │ ├── test_raw_data_utils.py │ ├── test_utils.py │ └── test_validation.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tidyms/validation.py 4 | tidyms/_plot_bokeh.py 5 | 6 | [report] 7 | exclude_lines = 8 | def plot 9 | pragma: no cover 10 | def __repr__ -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, E501 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # C generated files by Cython 10 | *.c 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/generated/ 75 | docs/_static/*.html 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # Pycharm 131 | .idea/ 132 | 133 | # VS code 134 | .vscode/ 135 | *.featureML 136 | *.dill 137 | docs/examples/exportedDataMatrix.tsv 138 | docs/examples/defined_spots_rtShifted.tsv 139 | docs/examples/defined_spots.tsv 140 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | build: 7 | os: ubuntu-20.04 8 | tools: 9 | python: "3.9" 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Bioanalytical mass spectrometry group at CIBION-CONICET 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 
15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include tidyms/chem/elements.json 4 | include tidyms/chem/isotopes.json -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # make file for pytest 2 | 3 | .PHONY: test-unit 4 | test-unit: 5 | pytest --cov=tidyms tests/unit 6 | 7 | .PHONY: test-all 8 | test-all: 9 | pytest --cov=tidyms 10 | 11 | .PHONY: coverage 12 | coverage: 13 | pytest --cov=tidyms && coverage html -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TidyMS: Tools for working with MS data in metabolomics 2 | ====================================================== 3 | 4 | TidyMS is a python library for processing Mass Spectrometry data. It aims to 5 | provide easy to use tools to read, process and visualize MS data generated in 6 | metabolomic studies. 7 | 8 | Features 9 | -------- 10 | 11 | TidyMS provides functionality to: 12 | 13 | 1. Read raw MS data in the mzML format 14 | 2. Spectrum and chromatogram creation. 15 | 3. Powerful and flexible peak picking functions optimized for chromatographic 16 | and spectral data. 17 | 4. Feature detection and feature correspondence in LC-MS data. 18 | 5. Reading processed data in a variety of formats (XCMS, MZMine2, ...) 19 | 5. Data matrix curation using widely accepted guidelines from the metabolomics 20 | community. 21 | 6. Interactive visualizations of raw and processed data using Bokeh, or 22 | publication quality plots using seaborn. 
23 | 24 | Installation 25 | ------------ 26 | 27 | The latest release can be installed from PyPI: 28 | 29 | ``` 30 | pip install tidyms 31 | ``` 32 | 33 | Examples 34 | -------- 35 | 36 | Jupyter notebooks with examples are available 37 | [here](https://github.com/griquelme/tidyms-notebooks). 38 | 39 | Tests 40 | ----- 41 | 42 | TidyMS uses unit tests for most of its functionality. 43 | The tests can be executed with 44 | ``` 45 | python setup.py test 46 | ``` 47 | 48 | Documentation 49 | ------------- 50 | 51 | The official documentation is available at 52 | [readthedocs](https://tidyms.readthedocs.io/en/latest/). 53 | 54 | 55 | Citation 56 | -------- 57 | 58 | If you find TidyMS useful, we would appreciate citations: 59 | 60 | Riquelme, G.; Zabalegui, N.; Marchi, P.; Jones, C.M.; Monge, M.E. A Python-Based 61 | Pipeline for Preprocessing LC–MS Data for Untargeted Metabolomics Workflows. 62 | _Metabolites_ **2020**, 10, 416, doi:10.3390/metabo10100416. 63 | 64 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/conftest.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 |
5 | 6 | {{ fullname | escape | underline}} 7 | 8 | .. currentmodule:: {{ module }} 9 | 10 | .. auto{{ objtype }}:: {{ objname }} 11 | 12 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 |
4 |
5 | 6 | {{ fullname | escape | underline}} 7 | 8 | .. currentmodule:: {{ module }} 9 | 10 | 11 | .. autoclass:: {{ name }} 12 | :members: 13 | 14 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. automodule:: {{fullname}} 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | API reference 6 | ============= 7 | 8 | Tools for working with raw data 9 | ------------------------------- 10 | 11 | .. autosummary:: 12 | :toctree: generated 13 | 14 | tidyms.Assay 15 | tidyms.MSData 16 | tidyms.Chromatogram 17 | tidyms.MSSpectrum 18 | 19 | Tools for working with processed data 20 | ------------------------------------- 21 | 22 | .. autosummary:: 23 | :toctree: generated 24 | 25 | tidyms.DataContainer 26 | tidyms.filter.Pipeline 27 | 28 | List of available filters and processors 29 | ---------------------------------------- 30 | 31 | .. autosummary:: 32 | :toctree: generated 33 | 34 | tidyms.filter.BatchCorrector 35 | tidyms.filter.BlankCorrector 36 | tidyms.filter.ClassRemover 37 | tidyms.filter.DilutionFilter 38 | tidyms.filter.DRatioFilter 39 | tidyms.filter.PrevalenceFilter 40 | tidyms.filter.VariationFilter 41 | 42 | Tools for working with chemical data 43 | ------------------------------------ 44 | 45 | .. autosummary:: 46 | :toctree: generated 47 | 48 | tidyms.chem.Formula 49 | tidyms.chem.PeriodicTable 50 | tidyms.chem.FormulaGenerator 51 | tidyms.chem.EnvelopeScorer 52 | 53 | Module reference 54 | ---------------- 55 | 56 | .. 
autosummary:: 57 | :toctree: generated 58 | 59 | tidyms.container 60 | tidyms.correspondence 61 | tidyms.fileio 62 | tidyms.filter 63 | tidyms.lcms 64 | tidyms.peaks 65 | tidyms.raw_data_utils 66 | tidyms.utils 67 | tidyms.chem.atoms 68 | tidyms.chem.envelope_tools 69 | tidyms.chem.formula 70 | 71 | tidyms.dartms -------------------------------------------------------------------------------- /docs/bokeh_plots.py: -------------------------------------------------------------------------------- 1 | from bokeh import plotting 2 | import tidyms as ms 3 | import numpy as np 4 | from pathlib import Path 5 | 6 | 7 | seed = 1234 8 | 9 | 10 | def create_chromatogram() -> ms.Chromatogram: 11 | 12 | filename = "NZ_20200227_039.mzML" 13 | dataset = "test-nist-raw-data" 14 | ms.fileio.download_tidyms_data(dataset, [filename]) 15 | path = Path(ms.fileio.get_tidyms_path()) 16 | path = path.joinpath(dataset, filename) 17 | 18 | ms_data = ms.MSData.create_MSData_instance( 19 | path, 20 | ms_mode="centroid", 21 | instrument="qtof", 22 | separation="uplc" 23 | ) 24 | mz_list = np.array([189.0734]) 25 | return ms.make_chromatograms(ms_data, mz_list)[0] 26 | 27 | 28 | def plot_chromatogram(): 29 | plotting.output_file("_static/chromatogram.html") 30 | chromatogram = create_chromatogram() 31 | p = chromatogram.plot(show=False) 32 | plotting.save(p) 33 | 34 | 35 | def plot_chromatogram_with_peaks(): 36 | # generate always the same plot 37 | plotting.output_file("_static/chromatogram-with-peaks.html") 38 | chromatogram = create_chromatogram() 39 | chromatogram.extract_features() 40 | p = chromatogram.plot(show=False) 41 | plotting.save(p) 42 | 43 | 44 | def feature_plot(): 45 | plotting.output_file("_static/feature-plot.html") 46 | data = ms.fileio.load_dataset("reference-materials") 47 | ignore = ["Z", "SV", "B", "SSS", "SCQC"] 48 | # search [M+H]+ from trp in the features 49 | mz = 205.097 50 | rt = 124 51 | # get a list of features compatible with the given m/z and rt 52 | ft_name = 
data.select_features(mz, rt) 53 | 54 | f = data.plot.feature(ft_name[0], draw=False, ignore_classes=ignore) 55 | plotting.save(f) 56 | 57 | 58 | def pca_plot(): 59 | plotting.output_file("_static/pca-scores.html") 60 | 61 | data = ms.fileio.load_dataset("reference-materials") 62 | ignore = ["Z", "SV", "B", "SSS", "SCQC"] 63 | f = data.plot.pca_scores(fig_params={"height": 250}, 64 | ignore_classes=ignore, 65 | scaling="autoscaling", 66 | draw=False) 67 | plotting.save(f) 68 | 69 | 70 | def create_assay(assay_path) -> ms.Assay: 71 | plotting.output_file("_static/pca-scores.html") 72 | ms.fileio.download_dataset("test-nist-raw-data") 73 | ms.fileio.download_dataset("reference-materials") 74 | tidyms_dir = Path(ms.utils.get_tidyms_path()) 75 | data_path = tidyms_dir.joinpath("test-nist-raw-data") 76 | sample_metadata_path = data_path.joinpath("sample_list.csv") 77 | 78 | assay = ms.Assay( 79 | data_path=data_path, 80 | assay_path=assay_path, 81 | sample_metadata=sample_metadata_path, 82 | separation="uplc", 83 | instrument="qtof" 84 | ) 85 | return assay 86 | 87 | 88 | def plot_roi_assay(assay: ms.Assay, save_path: str): 89 | plotting.output_file(save_path) 90 | sample_name = "NZ_20200227_039" 91 | p = assay.plot.roi(sample_name, show=False) 92 | plotting.save(p) 93 | 94 | 95 | def plot_stacked_chromatogram(assay: ms.Assay): 96 | plotting.output_file("_static/stacked-chromatograms.html") 97 | p = assay.plot.stacked_chromatogram(6, show=False) 98 | plotting.save(p) 99 | 100 | 101 | def create_assay_plots(): 102 | assay_path = "_build/test-assay" 103 | assay = create_assay(assay_path) 104 | mz_list = np.array( 105 | [118.0654, 144.0810, 146.0605, 181.0720, 188.0706, 189.0738, 106 | 195.0875, 205.0969] 107 | ) 108 | make_roi_params = { 109 | "tolerance": 0.015, 110 | "min_intensity": 5000, 111 | "targeted_mz": mz_list, 112 | } 113 | assay.detect_features(verbose=False, **make_roi_params) 114 | plot_roi_assay(assay, "_static/roi-no-peaks.html") 115 | 
assay.extract_features(store_smoothed=True, verbose=False) 116 | assay.describe_features(verbose=False) 117 | assay.build_feature_table() 118 | assay.match_features(verbose=False) 119 | plot_roi_assay(assay, "_static/roi-peaks.html") 120 | plot_stacked_chromatogram(assay) 121 | 122 | 123 | def create_plots(): 124 | plot_chromatogram() 125 | plot_chromatogram_with_peaks() 126 | feature_plot() 127 | pca_plot() 128 | create_assay_plots() -------------------------------------------------------------------------------- /docs/chem.rst: -------------------------------------------------------------------------------- 1 | .. _working-with-chemical-formulas: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Chemical data utilities 6 | ======================= 7 | 8 | The `chem` module contains utilities to work with chemical data such as isotopes, 9 | elements and formulas. Also, it contain utilities to generate formulas from 10 | exact mass, score isotopic envelopes and search isotopic envelope candidates 11 | from a list of m/z values. 12 | 13 | Searching chemical data 14 | ----------------------- 15 | 16 | :func:`~tidyms.chem.PeriodicTable` contains element and isotope information. 17 | The ``get_element`` method returns a :class:`~tidyms.chem.atom.Element` 18 | 19 | .. code-block:: python 20 | 21 | >>> import tidyms as ms 22 | >>> ptable = ms.chem.PeriodicTable() 23 | >>> oxygen = ptable.get_element("O") 24 | >>> oxygen 25 | Element(O) 26 | 27 | Element information can be retrieved easily: 28 | 29 | .. 
code-block:: python 30 | 31 | >>> oxygen.z 32 | 8 33 | >>> oxygen.symbol 34 | "O" 35 | >>> oxygen.isotopes 36 | {16: Isotope(16O), 17: Isotope(17O), 18: Isotope(18O)} 37 | >>> oxygen.get_monoisotope() 38 | Isotope(16O) 39 | >>> oxygen.get_abundances() 40 | (array([16, 17, 18]), 41 | array([15.99491462, 16.9991317 , 17.999161 ]), 42 | array([9.9757e-01, 3.8000e-04, 2.0500e-03])) 43 | 44 | :class:`~tidyms.chem.atom.Isotope` store exact mass, nominal mass and abundance 45 | of each isotope: 46 | 47 | .. code-block:: python 48 | 49 | >>> o16 = oxygen.get_monoisotope() 50 | >>> o16.m 51 | 15.99491462 52 | >>> o16.a 53 | 16 54 | >>> o16.p 55 | 0.99757 56 | 57 | Working with chemical formulas 58 | ------------------------------ 59 | 60 | Chemical formulas can be created with the :class:`~tidyms.chem.Formula` object: 61 | 62 | .. code-block:: python 63 | 64 | >>> water = ms.chem.Formula("H2O") 65 | >>> water 66 | Formula(H2O) 67 | 68 | Formula objects can be used to compute a formula mass and its isotopic envelope: 69 | 70 | .. code-block:: python 71 | 72 | >>> water.get_exact_mass() 73 | 18.010564684 74 | >>> M, p = water.get_isotopic_envelope() 75 | >>> M 76 | array([18.01056468, 19.01555724, 20.01481138, 21.02108788]) 77 | >>> p 78 | array([9.97340572e-01, 6.09327319e-04, 2.04962911e-03, 4.71450803e-07])) 79 | 80 | Formulas can be created by passing a dictionary of element or isotopes to a 81 | formula coefficient and the numerical charge of the formula. Formulas are 82 | implemented as dictionaries of isotopes to formula coefficients, so if an 83 | element is passed, it is assumed that it is the most abundant isotope. 84 | 85 | .. code-block:: python 86 | 87 | >>> f = ms.chem.Formula({"C": 1, "13C": 1, "O": 4}, 0) 88 | >>> f 89 | Formula(C(13C)O4) 90 | 91 | Isotopes can also be specified in the string format: 92 | 93 | .. 
code-block:: python 94 | 95 | >>> f = ms.chem.Formula("[C(13C)2H2O4]2-") 96 | Formula([C(13C)2H2O4]2-) 97 | >>> f.charge 98 | -2 99 | 100 | 101 | Sum formula generation 102 | ---------------------- 103 | 104 | The :class:`~tidyms.chem.FormulaGenerator` generates sum formulas from a mass 105 | value. To generate formulas, the space of formula must be defined by using 106 | and passed to the formula generator constructor: 107 | 108 | .. code-block:: python 109 | 110 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)} 111 | >>> formula_generator = ms.chem.FormulaGenerator(bounds) 112 | 113 | To generate formulas, an exact mass value must be passed, along with a tolerance 114 | to find compatible formulas. 115 | 116 | .. code-block:: python 117 | 118 | >>> f = ms.chem.Formula("C5H10O2") 119 | >>> M = f.get_exact_mass() # Mass value to generate formulas 120 | >>> tolerance = 0.005 121 | >>> formula_generator.generate_formulas(M, tolerance) 122 | >>> coefficients, isotopes, M_coeff = formula_generator.results_to_array() 123 | >>> coefficients 124 | array([[ 0, 10, 2, 4], 125 | [ 3, 8, 3, 1], 126 | [ 5, 10, 0, 2]]) 127 | >>> isotopes 128 | [Isotope(12C), Isotope(1H), Isotope(14N), Isotope(16O)] 129 | 130 | Coefficients is a 2D Numpy array where each row are coefficients of valid 131 | formulas and each column is an isotope. 132 | 133 | Formula generator objects can be created easily by using the static method 134 | :meth:`~tidyms.chem.FormulaGenerator.from_hmdb`, which generates reasonable 135 | coefficients spaces for the CHNOPS elements by finding the maximum coefficients 136 | in compounds from the `Human Metabolome DataBase `_: 137 | 138 | .. code-block:: python 139 | 140 | m = 1000 141 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m) 142 | 143 | ``m`` defines the maximum mass of the compounds included to create the coefficient 144 | space. ``m`` can take values of 500, 1000, 1500 and 2000. 
Other element can be 145 | added as follows = 146 | 147 | .. code-block:: python 148 | 149 | m = 1000 150 | bounds = {"Cl": (0, 2) 151 | formula_generator = ms.chem.FormulaGenerator.from_hmdb(m, bounds=bounds) 152 | 153 | 154 | Scoring Isotopic envelopes 155 | -------------------------- 156 | 157 | Scoring measured envelopes against theoretical values is a common strategy 158 | to establish a formula candidate for an unknown compound. The 159 | :class:`~tidyms.chem.EnvelopeScorer` uses the formulas generated by a formula 160 | generator and scores them using a measure of similarity between the measured and 161 | theoretical envelopes: 162 | 163 | .. code-block:: python 164 | 165 | >>> bounds = {"C": (0, 20), "H": (0, 40), "O": (0, 10), "N": (0, 5)} 166 | >>> fg = ms.chem.FormulaGenerator(bounds) 167 | >>> envelope_scorer = ms.chem.EnvelopeScorer(fg, scorer="qtof", max_length=10) 168 | 169 | The `max_length` parameter sets the maximum length of the measured envelopes to 170 | compare against theoretical values. The `scorer` parameter can be ``qtof``, 171 | ``orbitrap`` or a callable that implements a custom scorer. In the first two 172 | cases, default parameters are set for values measured in Q-TOF or Orbitrap 173 | instruments. The score method takes a list of exact mass and abundances of an 174 | envelope and scores against all compatible formulas. See the API for a detailed 175 | description on how to customize the scorer function. The results can be obtained 176 | with the :meth:`tidyms.chem.EnvelopeScorer.get_top_results` method: 177 | 178 | .. 
code-block:: python 179 | 180 | >>> import numpy as np 181 | >>> f = ms.chem.Formula("C5H10O2") 182 | >>> M, p = f.get_isotopic_envelope(4) # Get first four peaks from the envelope 183 | >>> tolerance = 0.005 184 | >>> envelope_scorer.score(M, p, tolerance) 185 | >>> coefficients, isotopes, score = envelope_scorer.get_top_results() 186 | >>> coefficients[np.argmax(score)] 187 | array([ 5, 10, 0, 2]) 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath(os.path.pardir)) 16 | sys.path.insert(0, os.path.abspath(os.getcwd())) 17 | from bokeh_plots import create_plots 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'TidyMS' 22 | copyright = "2020, Bioanalytical Mass Spectrometry Group at CIBION-CONICET" 23 | author = 'Gabriel Riquelme' 24 | 25 | # -- generate plot files ----------------------------------------------------- 26 | if not os.path.isdir("_static"): 27 | os.mkdir("_static") 28 | 29 | if not os.path.isdir("_build"): 30 | os.mkdir("_build") 31 | 32 | create_plots() 33 | 34 | # -- General configuration --------------------------------------------------- 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | 'sphinx.ext.autodoc', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.autosummary', 43 | 'sphinx.ext.intersphinx', 44 | 'IPython.sphinxext.ipython_directive', 45 | 'IPython.sphinxext.ipython_console_highlighting', 46 | 'bokeh.sphinxext.bokeh_plot', 47 | 'matplotlib.sphinxext.plot_directive', 48 | 'numpydoc' 49 | ] 50 | 51 | add_module_names = False 52 | # Generate the API documentation when building 53 | autosummary_generate = True 54 | numpydoc_show_class_members = False 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'sphinx_rtd_theme' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 75 | html_static_path = ['_static'] 76 | 77 | intersphinx_mapping = { 78 | 'pandas': ('https://pandas.pydata.org/docs/', None), 79 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 80 | } 81 | 82 | # set index.rst as the master doc 83 | master_doc = 'index' 84 | 85 | # include __init__ in docs 86 | autoclass_content = 'both' -------------------------------------------------------------------------------- /docs/descriptors.csv: -------------------------------------------------------------------------------- 1 | Descriptor,Meaning 2 | height,height relative to the baseline 3 | area,area minus the baseline area 4 | rt,weighted average of the retention time in the peak region 5 | mz,weighted average of the m/z in the peak region 6 | width,"width, computed as the region where the 95 % of the peak area is distributed" 7 | snr,"peak signal-to-noise ratio, defined as the quotient between the peak height and the noise level" 8 | mz std,standard deviation of the m/z in the peak region -------------------------------------------------------------------------------- /docs/examples/custom_peak_descriptors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tidyms.peaks import detect_peaks 3 | from tidyms.peaks import get_peak_descriptors 4 | from tidyms.utils import gaussian_mixture 5 | 6 | # always generate the same plot 7 | np.random.seed(1234) 8 | 9 | # create a signal with two gaussian peaks 10 | x = np.arange(100) 11 | gaussian_params = np.array([[25, 3, 30], [50, 2, 60]]) 12 | y = gaussian_mixture(x, gaussian_params).sum(axis=0) 13 | # add a noise term 14 
| y += np.random.normal(size=y.size, scale=1) 15 | 16 | # detect_peaks also returns the noise and baseline estimation used 17 | peaks, noise, baseline = detect_peaks(y) -------------------------------------------------------------------------------- /docs/examples/defined_spots_supervised.tsv: -------------------------------------------------------------------------------- 1 | msData_ID spotInd include name group class batch startRT_seconds endRT_seconds comment 2 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 0 False Spot_0 unknown unknown 1 10.15000016 308.5039902 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 3 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 1 True Airblank_1 Airblank unknown 1 326.7699909 367.3630142 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 4 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 2 True Airblank_2 Airblank unknown 1 376.4960003 419.1180038 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 5 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 3 True Airblank_3 Airblank unknown 1 428.2509899 469.8579884 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 6 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 4 True Airblank_4 Airblank unknown 1 
478.9920044 521.6139793 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 7 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 5 True Airblank_5 Airblank unknown 1 529.7320175 572.3539925 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 8 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 6 True Airblank_6 Airblank unknown 1 581.4870071 623.0949783 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 9 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 7 True Airblank_7 Airblank unknown 1 632.227993 674.8500252 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 10 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 8 True Airblank_8 Airblank unknown 1 683.9829826 725.5899811 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 11 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 9 True Airblank_9 Airblank unknown 1 734.7229958 776.3310242 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, 
startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 12 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 10 True Airblank_10 Airblank unknown 1 785.4639816 828.0860138 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 13 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 11 True Airblank_11 Airblank unknown 1 837.2190285 878.8260269 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 14 | 20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING 12 True Airblank_12 Airblank unknown 1 887.9600143 923.4780121 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-AIRBLANK-AFTERCLEANING', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 15 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 0 False Spot_0 Inj2ul unknown 2 37.54999995 229.3489981 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 16 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 1 True Inj10ul_1 Inj10ul unknown 2 247.6150131 288.2070065 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 17 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 2 True Inj10ul_2 Inj10ul unknown 2 297.3409939 339.9629974 spot automatically extracted by 
_get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 18 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 3 True Inj10ul_3 Inj10ul unknown 2 348.081007 390.7030106 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 19 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 4 True Inj10ul_4 Inj10ul unknown 2 399.8359966 441.4439964 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 20 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 5 True Inj10ul_5 Inj10ul unknown 2 450.5770111 493.1989861 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 21 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 6 True Inj10ul_6 Inj10ul unknown 2 501.3170242 543.9389992 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 22 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 7 True Inj2ul_1 Inj2ul unknown 2 553.0729866 595.6950188 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 23 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 8 True Inj2ul_2 Inj2ul unknown 2 603.8129997 646.4349747 
spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 24 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 9 True Inj2ul_3 Inj2ul unknown 2 655.5690193 698.1909943 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 25 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 10 True Inj2ul_4 Inj2ul unknown 2 706.3089752 748.9310074 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 26 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 11 True Inj2ul_5 Inj2ul unknown 2 757.0500183 799.6719933 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 27 | 20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY 12 True Inj2ul_6 Inj2ul unknown 2 808.8050079 844.3230057 spot automatically extracted by _get_separate_chronogram_indices(msData, '20221222-HE-350V-400C-POS-MESH-N002-REPEATABILITY', intensityThreshold = 50000.000000, startTime_seconds = 0.000000, endTime_seconds = 1000000.000000) 28 | -------------------------------------------------------------------------------- /docs/examples/roi-creation.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | import tidyms as ms 3 | import os 4 | 5 | # this code downloads an example file from Metabolights via ftp 6 | study_path = "pub/databases/metabolights/studies/public/MTBLS1919" 7 | sample_path 
= os.path.join(study_path, "Applications/Centroid_data") 8 | filename = "NZ_20200227_041.mzML" 9 | ftp = FTP("ftp.ebi.ac.uk") 10 | ftp.login() 11 | ftp.cwd(sample_path) 12 | with open(filename, "wb") as fin: 13 | ftp.retrbinary("RETR " + filename, fin.write) 14 | ftp.close() 15 | 16 | # specifying instrument and separation used in the experiments provides better 17 | # default values for several functions used in TidyMS 18 | ms_data = ms.MSData.create_MSData_instance( 19 | filename, 20 | ms_mode="centroid", 21 | instrument="qtof", 22 | separation="uplc" 23 | ) 24 | roi_list = ms_data.make_roi() 25 | -------------------------------------------------------------------------------- /docs/fileio.rst: -------------------------------------------------------------------------------- 1 | .. _working-with-raw-data: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | :orphan: 6 | 7 | Working with raw data 8 | ===================== 9 | 10 | TidyMS works with raw data in the mzML format using the :class:`~tidyms.MSData` 11 | class. In this section we show common operations on raw data. For file 12 | conversion to the mzML format see :ref:`this guide ` 13 | 14 | For the examples we will use an example mzML file that can be downloaded with 15 | the following code: 16 | 17 | .. code-block:: python 18 | 19 | import numpy as np 20 | import tidyms as ms 21 | 22 | filename = "NZ_20200227_039.mzML" 23 | dataset = "test-nist-raw-data" 24 | ms.fileio.download_tidyms_data(dataset, [filename], download_dir=".") 25 | 26 | 27 | Raw data 28 | -------- 29 | 30 | Raw MS data in the mzML format can be read through the :class:`~tidyms.MSData` 31 | object. 32 | 33 | ..
code-block:: python 34 | 35 | ms_data = ms.MSData.create_MSData_instance( 36 | filename, 37 | ms_mode="centroid", 38 | instrument="qtof", 39 | separation="uplc" 40 | ) 41 | 42 | It is necessary to specify if the data is in centroid or profile mode using the 43 | :code:`ms_mode` parameter, as some methods work in different ways for each 44 | type of data. Specifying the :code:`instrument` and :code:`separation` is also 45 | recommended, as these parameters set reasonable defaults in different functions 46 | used. 47 | 48 | :class:`~tidyms.MSData` is optimized for low memory usage and only loads the 49 | required data into memory. A single MS spectrum can be loaded using 50 | :meth:`~tidyms.MSData.get_spectrum` which returns a 51 | :class:`~tidyms.lcms.MSSpectrum`. 52 | 53 | .. code-block:: python 54 | 55 | index = 20 56 | sp = ms_data.get_spectrum(index) 57 | 58 | The index used is the order in which the data was stored in the file. In the 59 | same way, a stored chromatogram can be retrieved using 60 | :meth:`~tidyms.MSData.get_chromatogram`. The total count of spectra and 61 | chromatograms in the file can be obtained using 62 | :meth:`tidyms.MSData.get_n_spectra` and 63 | :meth:`tidyms.MSData.get_n_chromatograms` respectively. Iterating over all 64 | the spectra in a file can be done using 65 | :meth:`~tidyms.MSData.get_spectra_iterator`, which generates each one of the 66 | spectra in the file and allows filtering by acquisition time or MS level. 67 | Common operations with raw data are located in :mod:`tidyms.raw_data_utils`. 68 | 69 | 70 | Working with Mass Spectra 71 | ------------------------- 72 | 73 | :class:`~tidyms.MSSpectrum` stores the information from one scan. 
It is mostly 74 | used as a data storage class in several data processing steps, but it also has 75 | functionality to visualize the spectrum using the 76 | :meth:`~tidyms.MSSpectrum.plot` method and to convert a profile data spectrum 77 | into centroid mode using :meth:`tidyms.MSSpectrum.find_centroids`. 78 | 79 | :func:`tidyms.raw_data_utils.accumulate_spectra` combines a series of scans in 80 | a file into a single spectrum: 81 | 82 | .. code-block:: python 83 | 84 | combined_sp = ms.accumulate_spectra(ms_data, start_time=110, end_time=115) 85 | 86 | Chromatograms 87 | ------------- 88 | 89 | Besides the chromatograms stored in a file, extracted chromatograms can be 90 | created :func:`tidyms.raw_data_utils.make_chromatograms` which takes an array of 91 | m/z and returns a list :class:`tidyms.Chromatogram` objects, each one associated 92 | to one of the m/z values provided: 93 | 94 | .. code-block:: python 95 | 96 | mz_list = np.array([189.0734, 205.0967, 188.071]) 97 | chromatograms = ms.make_chromatograms(ms_data, mz_list) 98 | 99 | A chromatogram can be visualized using ``plot`` method: 100 | 101 | .. code-block:: python 102 | 103 | chrom = chromatograms[0] 104 | chrom.plot() 105 | 106 | .. raw:: html 107 | 108 | 109 | 110 | Peaks in a chromatogram are detected using 111 | :meth:`tidyms.lcms.LCRoi.extract_features`, which stores a list of 112 | :class:`tidyms.lcms.Peak` objects in the `features` attribute of the 113 | chromatogram. Plotting again the chromatogram shows the detected peaks: 114 | 115 | .. code-block:: python 116 | 117 | chrom.extract_features() 118 | chrom.plot() 119 | 120 | .. raw:: html 121 | 122 | 123 | 124 | Peak descriptors can be obtained using 125 | :meth:`tidyms.lcms.Roi.describe_features`: 126 | 127 | .. 
code-block:: python 128 | 129 | >>> chrom.describe_features() 130 | [{'height': 16572.38, 'area': 108529.94, 'rt': 125.73, 'width': 14.06, 131 | 'snr': 385.44, 'mz': None, 'mz_std': None}] 132 | 133 | A detailed description of the algorithm used for peak picking can be found 134 | :ref:`here `. These methods are also used to create a data matrix from 135 | a dataset. See :ref:`here ` for a tutorial on how to work with 136 | complete datasets to extract a data matrix. 137 | -------------------------------------------------------------------------------- /docs/fileio_tutorial.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/docs/fileio_tutorial.rst -------------------------------------------------------------------------------- /docs/glossary.rst: -------------------------------------------------------------------------------- 1 | .. definitions 2 | 3 | Definitions 4 | =========== 5 | 6 | Here is a list of the concepts used in TidyMS. 7 | 8 | .. glossary:: 9 | 10 | batch correction 11 | A correction step applied to reduce the time dependent variation in the 12 | metabolite signals due to instrumental response changes, carryover, 13 | or metabolite degradation, among others. 14 | 15 | blank correction 16 | A correction applied on study samples to remove the contribution to 17 | the signal coming from sample preparation. This process consists of 18 | measuring a set of blank samples and using them to estimate the 19 | sample preparation contribution to the signal. 20 | 21 | carryover 22 | A measurement artifact in LC-MS. Occurs when signals from one sample are 23 | detected in the next sample (signals are “carried over”). 24 | 25 | correction 26 | A data curation step where the data matrix is transformed to correct 27 | the data.
28 | 29 | data curation 30 | The process of reducing the bias introduced in the measurements during 31 | sample preparation and data acquisition. Also, the filtration of samples 32 | that cannot be measured in an analytically robust way. 33 | 34 | data matrix 35 | A matrix of feature values where each row is a sample or observation and 36 | each column is a feature. 37 | 38 | feature 39 | A measurable property of a phenomenon being observed. In LC-MS a feature 40 | is usually represented as a chromatographic peak. 41 | 42 | feature correspondence 43 | The process of matching features extracted in different samples. 44 | 45 | feature descriptor 46 | A series of characteristics of a feature. In the case of a 47 | chromatographic peak, feature descriptors can be peak area, retention 48 | time, mean m/z, among others. 49 | 50 | feature detection 51 | The process of finding a feature in a data set. Once a feature is 52 | detected it can be extracted into a feature descriptor. In LC-MS the 53 | feature detection procedure involves the detection of chromatographic 54 | peaks and extraction into rt, m/z and area information. 55 | 56 | feature table 57 | The table obtained after feature extraction, where each row is a 58 | feature detected in a sample and each column is a descriptor. 59 | 60 | filtration 61 | A data curation step where samples or features are removed according 62 | to a specific criterion. 63 | 64 | mapping 65 | A dictionary that maps the sample type to sample classes. The available 66 | sample types are: study sample, quality control, blank, system 67 | suitability. 68 | 69 | normalization 70 | An operation on the data matrix to adjust the sample values. Common 71 | normalization methods use different norms, such as the euclidean 72 | norm, Manhattan norm or maximum norm. 73 | 74 | prevalence filter 75 | A filter applied on a data matrix to remove features that are detected 76 | in a low number of samples.
77 | 78 | quality control sample 79 | Samples applied to demonstrate analytical accuracy, precision, and 80 | repeatability after data processing and can be converted to metrics 81 | describing data quality. 82 | 83 | run order 84 | Temporal order in which the different samples were analyzed. 85 | 86 | sample class 87 | The category of the sample. Can be related to the study (e.g: healthy, 88 | disease) or to the experiment design (quality control, blank, etc...). 89 | 90 | sample descriptor 91 | A characteristic of a sample. Can be the sample type, class, run order, 92 | analytical batch. 93 | 94 | sample type 95 | The type of sample used in the experiment. Sample types can be: study 96 | sample, quality control, blank, system suitability. 97 | 98 | scaling 99 | An operation on the data matrix to change the distribution of features. 100 | 101 | system suitability check 102 | The analysis of a series of samples to assess the performance of an 103 | analytical platform. 104 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. TidyMS documentation master file, created by 2 | sphinx-quickstart on Tue May 19 15:53:07 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | TidyMS 7 | ====== 8 | 9 | TidyMS is a python package that provides easy to use tools for processing and 10 | analyzing mass spectrometry based metabolomics data sets. It's built on top 11 | of Numpy, Pandas and scikit-learn. Get started by reading the 12 | :doc:`installation` instructions and then see an overview of the package in the 13 | :doc:`quickstart`. You can also see some applications in the example gallery. For 14 | detailed information about tidyms, you can see the :doc:`api` reference. 15 | 16 | .. 
toctree:: 17 | :maxdepth: 2 18 | :caption: Contents: 19 | 20 | Glossary 21 | Installation guide 22 | Quickstart 23 | Tutorials 24 | API Reference 25 | 26 | Indices and tables 27 | ================== 28 | 29 | * :ref:`genindex` 30 | * :ref:`modindex` 31 | * :ref:`search` 32 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. installation 2 | 3 | Installation 4 | ============ 5 | 6 | Python version 7 | -------------- 8 | 9 | We recommend to use the last version of Python 3. If you don't have Python 10 | installed we recommend installing it using the `Anaconda`_ distribution . 11 | 12 | .. _Anaconda: https://docs.anaconda.com/anaconda/install/ 13 | 14 | 15 | Install TidyMS 16 | -------------- 17 | 18 | If you already have Python, you can install TidyMS from the Python Package 19 | Index: 20 | 21 | On Linux: 22 | 23 | .. code-block:: sh 24 | 25 | $ pip install tidyms 26 | 27 | On Windows, if you are using Anaconda and didn't add Python to the PATH 28 | environment variable you have to run this command from the conda prompt. 29 | 30 | -------------------------------------------------------------------------------- /docs/mzml.rst: -------------------------------------------------------------------------------- 1 | .. _mzml: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Converting raw data to mzML format 6 | ================================== 7 | 8 | We recommend using `msconvert 9 | `_ to convert raw data 10 | generated from the different instruments to mzML format. Files can be converted 11 | from a GUI or from the command line. To convert all the files with names ending 12 | in :code:`.RAW` inside a directory from the command line the following command 13 | can be used: 14 | 15 | .. 
code-block:: bat 16 | 17 | msconvert *.RAW -o my_output_dir 18 | 19 | If you are using a Waters instrument with lockspray correction, the 20 | :code:`scanEvent` filter can be used to remove the signal from the lockspray. 21 | 22 | .. code-block:: bat 23 | 24 | msconvert *.RAW --filter "scanEvent 1" -o my_output_dir 25 | 26 | To perform feature detection, data must be provided in centroid format. This 27 | can be done using the :code:`peakPicking` filter option: 28 | 29 | .. code-block:: bat 30 | 31 | msconvert data.RAW --filter "peakPicking cwt snr=1 peakSpace=0.01" 32 | 33 | A :code:`snr=1` is recommended as noisy peaks will be removed during feature 34 | detection anyway. :code:`peakSpace` should be chosen according to the 35 | instrument used. For QTOF instruments a value of 0.01 is recommended, but 36 | for higher resolution instruments, such as orbitrap or FT-ICR, lower values 37 | may be used. 38 | -------------------------------------------------------------------------------- /docs/plots/dbscan-clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | n = 200 7 | X1 = np.random.normal(size=(n, 2)) 8 | samples = np.hstack((np.arange(n), np.arange(n))) 9 | X2 = np.random.normal(size=(n, 2), loc=(2, 2)) 10 | X = np.vstack((X1, X2)) 11 | 12 | dbscan_labels = ms.correspondence._cluster_dbscan(X, 2.0, 50, 10000) 13 | gmm_labels, score = ms.correspondence._process_cluster(X, samples, 2, 3.0) 14 | 15 | fig, ax = plt.subplots() 16 | for l in np.unique(dbscan_labels): 17 | ax.scatter(*X[dbscan_labels == l].T, label=l) 18 | 19 | ax.set_xlabel("m/z") 20 | ax.set_ylabel("Rt") 21 | ax.legend(title="DBSCAN labels") -------------------------------------------------------------------------------- /docs/plots/dbscan-parameters.py: -------------------------------------------------------------------------------- 1 | import
numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import DBSCAN 4 | import seaborn as sns 5 | from itertools import product 6 | 7 | sns.set_context("paper", font_scale=1.25) 8 | 9 | 10 | sample_size = [10, 20, 50, 100, 200, 500] 11 | fractions = [0.1, 0.25, 0.5, 0.75, 1.0] 12 | eps = [0.5, 1.0, 2.0, 3.0, 4.0] 13 | n_reps = 5 14 | results = list() 15 | 16 | for k_rep, size, f, e in product(range(n_reps), sample_size, fractions, eps): 17 | X = np.random.normal(size=(size, 2)) 18 | min_samples = round(size * f) 19 | dbscan = DBSCAN(eps=e, min_samples=min_samples, metric="chebyshev") 20 | dbscan.fit(X) 21 | cluster = dbscan.labels_ 22 | noise_fraction = (cluster == -1).sum() / size 23 | results.append([k_rep, size, f, e, noise_fraction]) 24 | df_normal = pd.DataFrame( 25 | data=results, 26 | columns=["rep", "sample size", "sample fraction", "eps", "noise fraction"] 27 | ) 28 | 29 | sns.catplot( 30 | data=df_normal, 31 | x="eps", 32 | y="noise fraction", 33 | palette="Set1", 34 | col="sample size", 35 | hue="sample fraction", 36 | legend="full", 37 | col_wrap=2, 38 | s=8 39 | ) -------------------------------------------------------------------------------- /docs/plots/gmm-clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | n = 200 7 | X1 = np.random.normal(size=(n, 2)) 8 | samples = np.hstack((np.arange(n), np.arange(n))) 9 | X2 = np.random.normal(size=(n, 2), loc=(2, 2)) 10 | X = np.vstack((X1, X2)) 11 | 12 | dbscan_labels = ms.correspondence._cluster_dbscan(X, 2.0, 50, 10000) 13 | gmm_labels, score = ms.correspondence._process_cluster(X, samples, 2, 3.0) 14 | 15 | fig, ax = plt.subplots() 16 | for l in np.unique(gmm_labels): 17 | ax.scatter(*X[gmm_labels == l].T, label=l) 18 | 19 | ax.set_xlabel("m/z") 20 | ax.set_ylabel("Rt") 21 | ax.legend(title="GMM labels") 
-------------------------------------------------------------------------------- /docs/plots/peak-definition.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # always generate the same plot 6 | np.random.seed(1234) 7 | 8 | grid = np.arange(50) 9 | signal = ms.utils.gauss(grid, 25, 2, 30) 10 | noise = np.random.normal(size=signal.size, scale=1) 11 | x = signal + noise + 3 12 | peak = ms.lcms.Peak(19, 25, 30) 13 | fig, ax = plt.subplots(figsize=(6, 6)) 14 | ax.plot(grid, x, label="signal") 15 | ax.scatter(grid[peak.start], x[peak.start], label="peak start", s=50) 16 | ax.scatter(grid[peak.apex], x[peak.apex], label="peak apex", s=50) 17 | ax.scatter(grid[peak.end], x[peak.end], label="peak end", s=50) 18 | ax.fill_between(grid[peak.start:peak.end + 1], 19 | x[peak.start:peak.end + 1], alpha=0.2, label="peak region") 20 | ax.annotate(text='', xy=(grid[peak.end + 5], x[peak.end]), 21 | xytext=(grid[peak.end + 5], x[peak.apex]), 22 | arrowprops=dict(arrowstyle='<->')) 23 | ax.annotate(text='peak \n prominence', 24 | xy=(grid[peak.end + 10],x[peak.apex] / 2)) 25 | ax.legend() 26 | -------------------------------------------------------------------------------- /docs/plots/peak-detection-example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tidyms import peaks 4 | from tidyms.lcms import Peak 5 | from tidyms.utils import gaussian_mixture 6 | 7 | # always generate the same plot 8 | np.random.seed(1234) 9 | 10 | # create a signal with two gaussian peaks 11 | x = np.arange(100) 12 | gaussian_params = np.array([[25, 3, 30], [50, 2, 60]]) 13 | y = gaussian_mixture(x, gaussian_params).sum(axis=0) 14 | # add a noise term 15 | y += np.random.normal(size=y.size, scale=0.5) 16 | 17 | noise_estimation = peaks.estimate_noise(y) 18 | baseline_estimation = 
peaks.estimate_baseline(y, noise_estimation) 19 | start, apex, end = peaks.detect_peaks(y, noise_estimation, baseline_estimation) 20 | peaks = [Peak(s, a, p) for s, a, p in zip(start, apex, end)] 21 | fig, ax = plt.subplots() 22 | ax.plot(x, y) 23 | for p in peaks: 24 | ax.fill_between(x[p.start:p.end], y[p.start:p.end], alpha=0.25) 25 | -------------------------------------------------------------------------------- /docs/plots/peak_detection_baseline_example.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(1234) 6 | signal_height = 100 7 | snr = 10 8 | n_col = 4 9 | x = np.arange(200) 10 | noise_level = signal_height / snr 11 | noise = np.random.normal(size=x.size, scale=noise_level) 12 | fig, ax = plt.subplots( 13 | nrows=3, ncols=n_col, figsize=(12, 12), sharex=True, sharey=True) 14 | 15 | # first row: one peak, different baselines 16 | row = 0 17 | baselines = [4, ms.utils.gauss(x, 100, 40, 20), x ** 2 * 0.002, 18 | np.sin(x * np.pi / 400) * 50] 19 | for col in range(n_col): 20 | signal = ms.utils.gauss(x, 100, 3, signal_height) 21 | y = signal + noise 22 | noise_estimation = ms.peaks.estimate_noise(y) 23 | ys = ms.lcms.gaussian_filter1d(y, 1) 24 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 25 | start, apex, end = ms.peaks.detect_peaks( 26 | ys, noise_estimation, baseline_estimation) 27 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 28 | ax[row, col].plot(x, y) 29 | ax[row, col].plot(x, baseline_estimation) 30 | for p in peaks: 31 | ax[row, col].fill_between(x[p.start:p.end + 1], 32 | baseline_estimation[p.start:p.end + 1], 33 | y[p.start:p.end + 1], alpha=0.25) 34 | 35 | # second row: two peaks, same baselines as first row 36 | row = 1 37 | for col in range(n_col): 38 | gaussian_params = np.array([[100, 3, signal_height], 39 | [110, 3, signal_height]]) 40 | signal = 
ms.utils.gaussian_mixture(x, gaussian_params).sum(axis=0) 41 | y = signal + baselines[col] + noise 42 | noise_estimation = ms.peaks.estimate_noise(y) 43 | ys = ms.lcms.gaussian_filter1d(y, 1) 44 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 45 | start, apex, end = ms.peaks.detect_peaks( 46 | ys, noise_estimation, baseline_estimation) 47 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 48 | ax[row, col].plot(x, y) 49 | ax[row, col].plot(x, baseline_estimation) 50 | for p in peaks: 51 | ax[row, col].fill_between(x[p.start:p.end + 1], 52 | baseline_estimation[p.start:p.end + 1], 53 | y[p.start:p.end + 1], alpha=0.25) 54 | 55 | # third row: different peak widths: 56 | row = 2 57 | widths = [3, 5, 7, 10] 58 | for col in range(n_col): 59 | w = widths[col] 60 | signal = ms.utils.gauss(x, 100, w, signal_height) 61 | y = signal + baselines[0] + noise 62 | noise_estimation = ms.peaks.estimate_noise(y) 63 | ys = ms.lcms.gaussian_filter1d(y, 1) 64 | baseline_estimation = ms.peaks.estimate_baseline(ys, noise_estimation) 65 | start, apex, end = ms.peaks.detect_peaks( 66 | ys, noise_estimation, baseline_estimation) 67 | peaks = [ms.lcms.Peak(s, a, p) for (s, a, p) in zip(start, apex, end)] 68 | ax[row, col].plot(x, y) 69 | ax[row, col].plot(x, baseline_estimation) 70 | for p in peaks: 71 | ax[row, col].fill_between(x[p.start:p.end + 1], 72 | baseline_estimation[p.start:p.end + 1], 73 | y[p.start:p.end + 1], alpha=0.25) 74 | -------------------------------------------------------------------------------- /docs/plots/roi-definition.py: -------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # always generate the same plot 6 | np.random.seed(1234) 7 | grid = np.arange(50) 8 | signal = ms.utils.gauss(grid, 25, 2, 30) 9 | noise = np.random.normal(size=signal.size, scale=1) 10 | x = signal + noise + 3 11 | mz_mean = 
203.08215 12 | mz = np.random.normal(size=signal.size, scale=0.0005) + mz_mean 13 | 14 | fig, ax = plt.subplots(figsize=(6, 6), nrows=2, sharex=True) 15 | ax[1].plot(grid, x) 16 | ax[1].set_ylabel("Intensity") 17 | ax[1].set_xlabel("Retention Time") 18 | ax[0].plot(grid, mz) 19 | ax[0].set_ylabel("m/z") 20 | ax[0].set_ylim(mz_mean - 0.0025, mz_mean + 0.0025) 21 | -------------------------------------------------------------------------------- /docs/preprocessing-steps.csv: -------------------------------------------------------------------------------- 1 | #,name,description 2 | 1,Feature Detection,"Regions of interest (ROI) are detected in each sample." 3 | 2,Feature Extraction,"Features are extracted from each ROI." 4 | 3,Feature description,"A table of feature descriptors is built for each sample." 5 | 4,Feature table construction,"A feature table for all samples is built" 6 | 5,Feature matching,"Features found in different samples are grouped if they have a common identity." 7 | 6,Data matrix creation,"The data matrix is created using the feature table." -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Quickstart 6 | ========== 7 | 8 | TidyMS [1]_ is a Python package that provides tools to process and analyze 9 | Mass Spectrometry (MS) data. Although suited for general use, it was designed 10 | to be used with datasets from LC-HRMS metabolomics experiments. It uses 11 | `Numpy `_, `Pandas `_ and 12 | `scikit-learn `_ for data processing and analysis. 13 | Some of the functionality that offers is: 14 | 15 | - read raw data in the mzML format using :class:`tidyms.MSData` class, optimized for speed and low memory usage. 16 | - Creation of chromatograms and accumulated spectra from raw data. 
17 | - :term:`Feature detection` and :term:`feature correspondence` in metabolomics datasets using the :class:`tidyms.Assay` class. 18 | - Read processed data from other mass spectrometry processing software (XCMS, mzmine2, etc...). 19 | - A container object to manage metabolomics data. 20 | - :term:`Data curation` of untargeted metabolomics data sets using widely accepted practices from the metabolomics community [2]_ [3]_ 21 | - Interactive data visualization using `bokeh `_, or publication quality plots using `seaborn `_. 22 | 23 | In the rest of this guide, you can find links for different use cases for the 24 | TidyMS package. A basic knowledge of MS and metabolomics is assumed, but you can 25 | look up in the :doc:`glossary` the concepts used in the guides. 26 | Installation instructions are available :doc:`here`. 27 | 28 | You can refer to the following guides to learn about specific topics: 29 | 30 | - :ref:`Working with raw data ` 31 | - :ref:`Processing complete datasets from raw data ` 32 | - :ref:`Curation of a metabolomics data matrix ` 33 | - :ref:`Feature detection and extraction algorithms ` 34 | - :ref:`Feature correspondence algorithm ` 35 | - :ref:`Converting proprietary instrument-specific formats into mzML ` 36 | 37 | 38 | References 39 | ---------- 40 | 41 | .. [1] Riquelme, G. *et al*, "A Python-Based Pipeline for Preprocessing LC–MS 42 | Data for Untargeted Metabolomics Workflows". Metabolites 2020, 10, 416. 43 | https://doi.org/10.3390/metabo10100416 44 | .. [2] W B Dunn *et al*, "Procedures for large-scale metabolic profiling of 45 | serum and plasma using gas chromatography and liquid chromatography 46 | coupled to mass spectrometry", Nature Protocols volume 6, pages 47 | 1060–1083 (2011). 48 | ..
[3] D Broadhurst *et al*, "Guidelines and considerations for the use of 50 | system suitability and quality control samples in mass spectrometry assays 51 | applied in untargeted clinical metabolomic studies.", Metabolomics, 52 | 2018;14(6):72. doi: 10.1007/s11306-018-1367-3 -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh 2 | numpydoc 3 | sphinx 4 | tidyms 5 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials: 2 | 3 | .. py:currentmodule:: tidyms 4 | 5 | Tutorials 6 | ========= 7 | 8 | In this section there is available a list of tutorials on specific topics. 9 | 10 | * :ref:`Converting files to mzML ` 11 | * :ref:`Working with raw data ` 12 | * :ref:`Processing complete datasets ` 13 | * :ref:`Feature detection ` 14 | * :ref:`Feature correspondence ` 15 | * :ref:`Working with chemical formulas ` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "tidyms" 7 | version = "0.7.0" 8 | description = "Tools for working with MS data in metabolomics" 9 | authors = [ 10 | { name = "Gabriel Riquelme" }, 11 | ] 12 | readme = "README.md" 13 | license = {file = "LICENSE"} 14 | classifiers = [ 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "License :: OSI Approved :: BSD License", 18 | "Topic :: Scientific/Engineering :: Bio-Informatics", 19 | "Topic :: Scientific/Engineering :: Chemistry", 20 | "Topic :: Scientific/Engineering :: Medical Science Apps." 
21 | ] 22 | dependencies = [ 23 | "beautifulsoup4>=4.11.2", 24 | "bokeh>=3.0", 25 | "Cerberus>=1.3", 26 | "dill>=0.3.6", 27 | "ipython>=8.1", 28 | "joblib>=1.1", 29 | "matplotlib>=3.5.1", 30 | "natsort>=8.2.0", 31 | "networkx>=3.0", 32 | "numpy>=1.22", 33 | "openpyxl>=3.0", 34 | "pandas>=1.5.3", 35 | "plotnine>=0.10.1", 36 | "requests", 37 | "scikit-learn>=1.0.2", 38 | "scipy>=1.8", 39 | "seaborn>=0.11", 40 | "statsmodels>=0.13", 41 | "tqdm>=4.0", 42 | "umap-learn>=0.5.3", 43 | "xlrd>=2.0" 44 | ] 45 | requires-python = ">=3.9" 46 | 47 | [project.urls] 48 | Homepage = "https://github.com/griquelme/tidyms" 49 | 50 | [tool.mypy] 51 | python_version = "3.9" 52 | files = ["src/tidyms"] 53 | 54 | [tool.pytest.ini_options] 55 | pythonpath = [ 56 | ".", "./src" 57 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh>=3.0.3 2 | Cerberus>=1.3 3 | ipython>=8.1 4 | joblib>=1.1 5 | matplotlib>=3.5.1 6 | networkx>=3.0 7 | numpy>=1.22 8 | openpyxl>=3.0 9 | pandas>=1.5.3 10 | requests 11 | scikit-learn>=1.0.2 12 | scipy>=1.8 13 | seaborn>=0.12 14 | statsmodels>=0.13 15 | tqdm>=4.0 16 | xlrd>=2.0 17 | 18 | plotnine>=0.10.1 19 | natsort>=8.2.0 20 | beautifulsoup4>=4.11.2 21 | dill>=0.3.6 22 | umap-learn>=0.5.3 -------------------------------------------------------------------------------- /src/tidyms/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TidyMS 3 | ====== 4 | 5 | A package to work with Mass Spectrometry data from Metabolomics Experiments. 6 | 7 | Provides 8 | 1. The Assay object to process datasets from raw data. 9 | 2. The MSData object to work with raw data. 10 | 3. The DataContainer object to store metabolomics data sets. 11 | 4. Pipeline and Processor objects to perform curation of data sets. 12 | 13 | """ 14 | 15 | from . import chem 16 | from . import fileio 17 | from . 
import container 18 | from . import utils 19 | from . import peaks 20 | from . import filter 21 | from . import lcms 22 | from . import simulation 23 | from . import raw_data_utils 24 | from . import _mzml 25 | from . import _build_data_matrix 26 | from . import correspondence 27 | from . import fill_missing 28 | from . import consensus_annotation 29 | from .container import DataContainer 30 | from .fileio import MSData 31 | from .lcms import Chromatogram, MSSpectrum 32 | from .assay import Assay 33 | from .raw_data_utils import * 34 | from . import dartms 35 | from .annotation import annotation 36 | 37 | utils.create_tidyms_dir() 38 | SETTINGS = utils.get_settings() 39 | 40 | if SETTINGS["bokeh"]["apply_theme"]: 41 | from bokeh.themes import Theme as _Theme 42 | from bokeh.io import curdoc as _curdoc 43 | theme = SETTINGS["bokeh"]["theme"] 44 | _curdoc().theme = _Theme(json=theme) 45 | 46 | if utils.is_notebook(): 47 | from bokeh.plotting import output_notebook as _output_notebook 48 | 49 | _output_notebook() 50 | -------------------------------------------------------------------------------- /src/tidyms/_constants.py: -------------------------------------------------------------------------------- 1 | from typing import Final, List 2 | 3 | 4 | # separation modes 5 | HPLC: Final[str] = "hplc" 6 | UPLC: Final[str] = "uplc" 7 | DART: Final[str] = "None/DART" 8 | LC_MODES: Final[List[str]] = [UPLC, HPLC, DART] 9 | SEPARATION_MODES: Final[List[str]] = LC_MODES + [] 10 | 11 | # instruments 12 | QTOF: Final[str] = "qtof" 13 | ORBITRAP: Final[str] = "orbitrap" 14 | MS_INSTRUMENTS: Final[List[str]] = [QTOF, ORBITRAP] 15 | 16 | # MS mode 17 | CENTROID: Final[str] = "centroid" 18 | PROFILE: Final[str] = "profile" 19 | MS_MODES: Final[List[str]] = [CENTROID, PROFILE] 20 | 21 | # Data loading 22 | MEMORY: Final[str] = "memory" 23 | INFILE: Final[str] = "file" 24 | SIMULATED: Final[str] = "simulated" 25 | DATA_LOAD_MODES: Final[List[str]] = [MEMORY, INFILE, SIMULATED] 26 | 
DEFAULT_DATA_LOAD_MODE = INFILE 27 | 28 | # feature descriptors 29 | FEATURE: Final[str] = "feature" 30 | MZ: Final[str] = "mz" 31 | RT_START: Final[str] = "rt start" 32 | RT_END: Final[str] = "rt end" 33 | RT: Final[str] = "rt" 34 | RT_STD: Final[str] = "rt std" 35 | AREA: Final[str] = "area" 36 | WIDTH: Final[str] = "width" 37 | HEIGHT: Final[str] = "height" 38 | SNR: Final[str] = "snr" 39 | MZ_STD: Final[str] = "mz_std" 40 | ROI_INDEX: Final[str] = "roi_index" 41 | FT_INDEX: Final[str] = "ft_index" 42 | MERGED: Final[str] = "merged" 43 | 44 | # chromatogram names 45 | BASELINE: Final[str] = "baseline" 46 | NOISE: Final[str] = "noise" 47 | SPINT: Final[str] = "spint" # spectral intensity 48 | ROI_FEATURE_LIST: Final[str] = "features" 49 | TIME: Final[str] = "time" 50 | SCAN: Final[str] = "scan" 51 | MODE: Final[str] = "mode" 52 | 53 | # peak names 54 | START: Final[str] = "start" 55 | APEX: Final[str] = "apex" 56 | END: Final[str] = "end" 57 | 58 | # isotopologue envelope annotation 59 | ENVELOPE_LABEL: Final[str] = "envelope_label" 60 | ENVELOPE_INDEX: Final[str] = "envelope_index" 61 | CHARGE: Final[str] = "charge" 62 | 63 | # sample metadata 64 | SAMPLE: Final[str] = "sample" 65 | CLASS: Final[str] = "class" 66 | ORDER: Final[str] = "order" 67 | BATCH: Final[str] = "batch" 68 | LABEL: Final[str] = "cluster" 69 | ID: Final[str] = "id" 70 | DILUTION: Final[str] = "dilution" 71 | TYPE: Final[str] = "type" 72 | 73 | # sample types 74 | QC_TYPE: Final[str] = "qc" 75 | DQC_TYPE: Final[str] = "dqc" 76 | STUDY_TYPE: Final[str] = "sample" 77 | BLANK_TYPE: Final[str] = "blank" 78 | SAMPLE_TYPES: Final[list[str]] = [QC_TYPE, STUDY_TYPE, BLANK_TYPE, DQC_TYPE] 79 | 80 | 81 | # assay file and dir names 82 | ROI_DIR: Final[str] = "roi" 83 | FT_DIR: Final[str] = "feature" 84 | MANAGER_FILENAME: Final[str] = "metadata.pickle" 85 | FT_TABLE_FILENAME: Final[str] = "feature-table.pickle" 86 | DATA_MATRIX_FILENAME: Final[str] = "data-matrix.pickle" 87 | 88 | # preprocessing steps 
89 | DETECT_FEATURES: Final[str] = "detect_features" 90 | EXTRACT_FEATURES: Final[str] = "extract_features" 91 | DESCRIBE_FEATURES: Final[str] = "describe_features" 92 | ANNOTATE_ISOTOPOLOGUES: Final[str] = "annotate_isotopologues" 93 | ANNOTATE_ADDUCTS: Final[str] = "annotate_adducts" 94 | BUILD_FEATURE_TABLE: Final[str] = "build_feature_table" 95 | MATCH_FEATURES: Final[str] = "match_features" 96 | MAKE_DATA_MATRIX: Final[str] = "make_data_matrix" 97 | FILL_MISSING: Final[str] = "fill_missing" 98 | 99 | PREPROCESSING_STEPS: Final[List[str]] = [ 100 | DETECT_FEATURES, 101 | EXTRACT_FEATURES, 102 | DESCRIBE_FEATURES, 103 | ANNOTATE_ISOTOPOLOGUES, 104 | ANNOTATE_ADDUCTS, 105 | BUILD_FEATURE_TABLE, 106 | MATCH_FEATURES, 107 | MAKE_DATA_MATRIX, 108 | FILL_MISSING, 109 | ] 110 | -------------------------------------------------------------------------------- /src/tidyms/_plot_bokeh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import bokeh.plotting 3 | from bokeh.palettes import all_palettes 4 | from bokeh.models import ColumnDataSource, Segment 5 | from .utils import get_settings 6 | from . 
def palette_cycler(palette: List[str]) -> Generator[str, None, None]:
    """Yield the colors of `palette` in order, restarting from the first
    color after the last one is reached."""
    position = 0
    n_colors = len(palette)
    while True:
        yield palette[position]
        position = (position + 1) % n_colors
def fill_area(
    figure: bokeh.plotting.figure,
    x: np.ndarray,
    y: np.ndarray,
    start: int,
    end: int,
    color: str,
    **varea_params,
):
    """
    Shades the region between ``y[start:end]`` and zero on `figure`.

    Parameters
    ----------
    figure : bokeh.plotting.figure
    x : array
    y : array
    start : int
        first index of the shaded region.
    end : int
        index after the last point of the shaded region.
    color : str
        fill color for the area.
    varea_params :
        key-value parameters passed to ``figure.varea``; they override the
        defaults taken from the bokeh settings.

    """
    params = get_varea_params()
    # updating with an empty dict is a no-op, so no guard is needed
    params.update(varea_params)
    figure.varea(x[start:end], y[start:end], 0, fill_color=color, **params)
cover 146 | """ 147 | Methods to plot data from an Assay. Generates Bokeh Figures. 148 | 149 | Methods 150 | ------- 151 | roi(sample: str) : 152 | m/z vs Rt view of the ROI and features in a sample. 153 | stacked_chromatogram(feature: int) : 154 | Overlapped chromatographic peaks for a feature in all samples 155 | 156 | """ 157 | def __init__(self, assay): 158 | self.assay = assay 159 | self.roi_index = None 160 | self.ft_index = None 161 | 162 | def _build_roi_index_table(self): 163 | ft_table = self.assay.feature_table.copy() 164 | ft_table = ft_table[ft_table[c.LABEL] > -1] 165 | self.roi_index = ( 166 | ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.ROI_INDEX) 167 | .fillna(-1) 168 | .astype(int) 169 | ) 170 | 171 | def _build_peak_index_table(self): 172 | ft_table = self.assay.feature_table.copy() 173 | ft_table = ft_table[ft_table[c.LABEL] > -1] 174 | self.ft_index = ( 175 | ft_table.pivot(index=c.SAMPLE, columns=c.LABEL, values=c.FT_INDEX) 176 | .fillna(-1) 177 | .astype(int) 178 | ) 179 | 180 | def roi(self, sample: str, show: bool = True) -> bokeh.plotting.figure: 181 | """ 182 | Plots m/z vs time dispersion of the ROI in a sample. Detected features 183 | are highlighted using circles. 184 | 185 | Parameters 186 | ---------- 187 | sample : str 188 | sample used in the Assay. 189 | show : bool, default=True 190 | If True calls ``bokeh.plotting.show`` on the Figure. 
    def stacked_chromatogram(
        self,
        cluster: int,
        include_classes: Optional[List[str]] = None,
        show: bool = True
    ) -> bokeh.plotting.figure:
        """
        Plots chromatograms of a feature detected across different samples.

        Parameters
        ----------
        cluster : int
            cluster value obtained from feature correspondence.
        include_classes : List[str] or None, default=None
            List of classes to plot. If None is used, samples from all classes
            are plotted.
        show : bool, default=True
            If True calls ``bokeh.plotting.show`` on the Figure.

        Returns
        -------
        bokeh Figure

        Raises
        ------
        ValueError
            If the assay has not gone through the feature matching step.

        """
        if not self.assay.manager.check_step("match_features"):
            msg = "This plot only can be generated after feature matching"
            raise ValueError(msg)
        else:
            # lazily build the sample -> feature/ROI index lookup tables
            if self.ft_index is None:
                self._build_peak_index_table()

            if self.roi_index is None:
                self._build_roi_index_table()

            fig_params = get_chromatogram_figure_params()
            fig = bokeh.plotting.figure(**fig_params)
            # per-sample ROI/feature indices for this cluster; -1 marks
            # samples where the feature was not detected
            roi_index = self.roi_index[cluster].to_numpy()
            ft_index = self.ft_index[cluster].to_numpy()
            samples = self.roi_index.index
            # TODO: fix after refactoring DataContainers
            classes = self.assay.get_sample_metadata()["class"]
            palette = get_palette()
            if include_classes is not None:
                class_to_color = dict()
                for k, cl in enumerate(include_classes):
                    class_to_color[cl] = palette[k]

            # NOTE(review): the loop variables below shadow the `roi_index`
            # and `ft_index` arrays; harmless because the arrays are not used
            # after this point, but worth renaming eventually.
            iterator = zip(samples, roi_index, ft_index, classes)
            for sample, roi_index, ft_index, class_ in iterator:
                # draw only detected features, restricted to
                # `include_classes` when it is provided
                check_draw = (
                    (roi_index > -1) and
                    ((include_classes is None) or (class_ in include_classes))
                )
                if check_draw:
                    if include_classes is None:
                        color = palette[0]
                    else:
                        color = class_to_color[class_]
                    r = self.assay.load_roi(sample, roi_index)
                    ft = r.features[ft_index]
                    add_line(fig, r.time, r.spint)
                    fill_area(
                        fig, r.time, r.spint, ft.start, ft.end, color, alpha=0.2)
            set_chromatogram_axis_params(fig)
            if show:
                bokeh.plotting.show(fig)
        return fig
def create_annotation_table(feature_list: list[Feature]) -> pd.DataFrame:
    """
    Build a DataFrame summarizing the isotopologue annotation of each feature.

    Parameters
    ----------
    feature_list : list[Feature]
        Annotated features.

    Returns
    -------
    pd.DataFrame
        One row per feature with its ROI index, feature index, envelope
        index, envelope label and charge.

    """
    rows: dict[str, list[int]] = {
        c.ROI_INDEX: [],
        c.FT_INDEX: [],
        c.ENVELOPE_INDEX: [],
        c.ENVELOPE_LABEL: [],
        c.CHARGE: [],
    }

    for feature in feature_list:
        ann = feature.annotation
        rows[c.ROI_INDEX].append(feature.roi.index)
        rows[c.FT_INDEX].append(feature.index)
        rows[c.ENVELOPE_INDEX].append(ann.isotopologue_index)
        rows[c.ENVELOPE_LABEL].append(ann.isotopologue_label)
        rows[c.CHARGE].append(ann.charge)

    return pd.DataFrame(rows)
def annotate(
    feature_list: list[Feature],
    mmi_finder: MMIFinder,
    envelope_finder: EnvelopeFinder,
    envelope_validator: EnvelopeValidator,
) -> None:
    """
    Annotate isotopologues in a sample.

    Annotations are added to the `annotation` attribute of each feature.

    Parameters
    ----------
    feature_list : list[Feature]
        List of features obtained after feature extraction.
    mmi_finder : MMIFinder
    envelope_finder : EnvelopeFinder
    envelope_validator : EnvelopeValidator

    """
    data = AnnotationData(feature_list)
    # greedy strategy: take the most intense non-annotated feature, find the
    # best isotopic envelope that contains it, flag the envelope members as
    # annotated, and repeat until no non-annotated feature remains.
    monoisotopologue = data.get_monoisotopologue()
    polarity = mmi_finder.polarity
    while monoisotopologue is not None:
        mmi_candidates = mmi_finder.find(data)
        envelope, charge = find_best_envelope(
            data,
            monoisotopologue,
            polarity,
            mmi_candidates,
            envelope_finder,
            envelope_validator,
        )
        # annotate() always flags at least `monoisotopologue`, so the loop
        # makes progress on every iteration and terminates.
        data.annotate(envelope, charge)
        monoisotopologue = data.get_monoisotopologue()
min_length: 178 | return 0 179 | 180 | if monoisotopologue not in candidate: 181 | return 0 182 | 183 | M, p = candidate[0].compute_isotopic_envelope(candidate) 184 | em_correction = EM * charge * polarity 185 | M = np.array(M) * charge - em_correction 186 | p = np.array(p) 187 | return validator.validate(M, p) 188 | -------------------------------------------------------------------------------- /src/tidyms/annotation/annotation_data.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..lcms import Feature 3 | from collections.abc import Sequence 4 | 5 | 6 | class AnnotationData: 7 | """ 8 | Feature data. 9 | 10 | Attributes 11 | ---------- 12 | features : list[Feature] 13 | List of features sorted by m/z 14 | annotation : dict[Feature, int] 15 | Annotation of features 16 | similarity_cache : SimilarityCache 17 | Stores similarity between features. 18 | non_annotated : set[Feature] 19 | Non-annotated features. 20 | 21 | """ 22 | 23 | def __init__(self, features: Sequence[Feature]): 24 | self.features = sorted(features) 25 | self.non_annotated = set(features) 26 | self._monoisotopologues = sorted(features, key=lambda x: x.height) 27 | self.similarity_cache = SimilarityCache() 28 | self._label_counter = 0 29 | 30 | def get_monoisotopologue(self) -> Optional[Feature]: 31 | """Gets the current non-annotated feature with the greatest area.""" 32 | if self._monoisotopologues: 33 | mono = self._monoisotopologues[-1] 34 | while mono not in self.non_annotated: 35 | self._monoisotopologues.pop() 36 | if self._monoisotopologues: 37 | mono = self._monoisotopologues[-1] 38 | else: 39 | mono = None 40 | else: 41 | mono = None 42 | return mono 43 | 44 | def annotate(self, features: Sequence[Feature], charge: int): 45 | """Labels a list of features as an isotopic envelope.""" 46 | if len(features) > 1: 47 | for k, ft in enumerate(features): 48 | ft.annotation.charge = charge 49 | 
class SimilarityCache:
    """Stores and retrieves the similarity between features in a sample."""

    def __init__(self):
        # maps each feature to a dict of {other feature: similarity}
        self._cache: dict[Feature, dict[Feature, float]] = dict()

    def get_similarity(self, ft1: Feature, ft2: Feature):
        """Return the similarity between two features, computing and caching
        it on first request. The value is stored symmetrically."""
        cache1 = self._cache.setdefault(ft1, dict())
        cache2 = self._cache.setdefault(ft2, dict())
        try:
            return cache1[ft2]
        except KeyError:
            similarity = ft1.compare(ft2)
            cache1[ft2] = similarity
            cache2[ft1] = similarity
            return similarity
# NOTE(review): this chunk begins inside the envelope finder class; its header
# lies above this fragment, so the class statement is re-stated here to keep
# the methods below attached to it — confirm the class name against the
# original envelope_finder.py header.
class EnvelopeFinder:
    r"""
    Finds isotopic envelope candidates from a list of features sorted by m/z.

    Attributes
    ----------
    tolerance : float
        tolerance used to extend the element based bounds
    max_length : int
        max length of the envelopes

    Notes
    -----
    Using a list of elements, theoretical bounds are computed for each M1, M2,
    M3, etc... isotopologue. Then using these values and the `mz_tolerance` and
    the `max_charge`, the bounds are adjusted according to the following
    equations:

    .. math::

        mz_{k, min} = \frac{m_{k, min}}{q} - mz_{tolerance}

        mz_{k, max} = \frac{m_{k, max}}{q} + mz_{tolerance}

    where :math:`m_{k, min}` is the minimum theoretical value for the k-th
    isotopologue and q is the charge.

    The envelopes candidates found are determined based on m/z compatibility
    only. To reduce the number of candidates, the list of m/z values should be
    reduced by other means, such as correlation of the values.

    """

    def __init__(
        self,
        elements: list[str],
        mz_tolerance: float,
        max_length: int = 5,
        min_p: float = 0.01,
        min_similarity: float = 0.9,
    ):
        """
        Parameters
        ----------
        elements : list[str]
            List of elements used to compute mass difference windows.
        mz_tolerance : float
            m/z tolerance used to match candidates.
        max_length : int, default=5
            Maximum envelope length to search.
        min_p : float, default=0.01
            Number between 0 and 1. The minimum abundance of the isotopes of
            each element to be used for m/z estimation.
        min_similarity : float, default=0.9
            Minimum similarity to create candidates.

        """
        el_list = [PeriodicTable().get_element(x) for x in elements]
        self.tolerance = mz_tolerance
        self.max_length = max_length
        self.min_similarity = min_similarity
        self.bounds = _make_exact_mass_difference_bounds(el_list, min_p)

    def find(
        self,
        data: "AnnotationData",
        mmi: "Feature",
        charge: int,
    ) -> list[Sequence["Feature"]]:
        """
        Finds isotopic envelope candidates starting from the minimum mass
        isotopologue (MMI).

        Parameters
        ----------
        data : AnnotationData
            Feature data: the m/z-sorted feature list, the non-annotated set
            and the similarity cache.
        mmi : Feature
            Minimum mass feature.
        charge : int
            Absolute value of the charge state of the isotopic envelope.

        Returns
        -------
        envelopes : list[Sequence[Feature]]
            List of isotopic envelope candidates.

        """
        envelopes = _find_envelopes(
            data.features,
            mmi,
            data.non_annotated,
            data.similarity_cache,
            charge,
            self.max_length,
            self.tolerance,
            self.min_similarity,
            self.bounds,
        )
        envelopes = _remove_sub_candidates(envelopes)
        return envelopes


def _remove_sub_candidates(
    candidates: list[Sequence["Feature"]],
) -> list[Sequence["Feature"]]:
    """
    Remove candidates whose features are a subset of another candidate.

    FIX: the previous implementation decided and appended inside the inner
    comparison loop, which (a) appended the same candidate once per
    non-superset comparison and (b) silently dropped the final popped
    candidate, whose comparison loop never runs. A candidate is now kept
    exactly once, and only when it is not a subset of any other candidate
    (pending or already validated).
    """
    validated: list[Sequence["Feature"]] = []
    while candidates:
        current = candidates.pop()
        current_set = set(current)
        others = candidates + validated
        if not any(current_set <= set(other) for other in others):
            validated.append(current)
    return validated


def _find_envelopes(
    features: Sequence["Feature"],
    mmi: "Feature",
    non_annotated: set["Feature"],
    cache: "SimilarityCache",
    charge: int,
    max_length: int,
    mz_tolerance: float,
    min_similarity: float,
    bounds: dict[int, Tuple[float, float]],
) -> list[Sequence["Feature"]]:
    """
    Finds isotopic envelope candidates starting from the MMI.

    Parameters
    ----------
    features : Sequence[Feature]
        List of features sorted by m/z.
    mmi : Feature
        Minimum mass feature.
    non_annotated : set[Feature]
        Non annotated features.
    cache : SimilarityCache
        Cache of pairwise feature similarities.
    charge : int
        Absolute value of the charge state of the isotopic envelope.
    max_length : int
        Maximum length of the isotope candidates.
    mz_tolerance : float
        Tolerance used to extend the m/z search interval.
    min_similarity : float
        Minimum similarity to create candidates.
    bounds : dict
        Bounds obtained with _make_exact_mass_difference_bounds.

    Returns
    -------
    envelopes : list[Sequence[Feature]]
        List where each element is a list of features with isotopic envelope
        candidates.

    """
    completed_candidates = list()
    candidates = [[mmi]]
    while candidates:
        # remove and extend a candidate
        candidate = candidates.pop()

        # find features with compatible m/z and similarities
        min_mz, max_mz = _get_next_mz_search_interval(
            candidate, bounds, charge, mz_tolerance
        )
        start = bisect.bisect(features, min_mz)
        end = bisect.bisect(features, max_mz)
        new_features = list()
        for k in range(start, end):
            k_ft = features[k]
            is_similar = cache.get_similarity(mmi, k_ft) >= min_similarity
            is_non_annotated = k_ft in non_annotated
            if is_similar and is_non_annotated:
                new_features.append(k_ft)

        # extend candidates with compatible features
        length = len(candidate)
        if new_features and (length < max_length):
            tmp = [candidate + [x] for x in new_features]
            candidates.extend(tmp)
        else:
            completed_candidates.append(candidate)
    # a valid envelope needs at least the MMI plus one isotopologue
    completed_candidates = [x for x in completed_candidates if len(x) > 1]
    return completed_candidates


def _get_next_mz_search_interval(
    envelope: Sequence["Feature"],
    elements_mass_difference: dict[int, Tuple[float, float]],
    charge: int,
    mz_tolerance: float,
) -> Tuple[float, float]:
    """
    Computes the valid m/z range for a k-th isotopologue using information from
    m/z values from previous isotopologues.

    Parameters
    ----------
    envelope : Sequence[Feature]
        Features found so far, sorted by m/z.
    elements_mass_difference : dict
        Bounds obtained with _make_exact_mass_difference_bounds.
    charge : int
    mz_tolerance : float

    Returns
    -------
    min_mz : float
        Minimum m/z value for the M + k isotopologue.
    max_mz : float
        Maximum m/z value for the M + k isotopologue.

    """
    # If the charge is 0 (neutral mass) the results are the same as using
    # charge = 1. There is no difference between positive and negative charges.
    charge = max(1, abs(charge))
    length = len(envelope)
    # dummy values guaranteed to be replaced (or to yield an empty interval)
    min_mz = envelope[-1].mz + 2
    max_mz = envelope[-1].mz - 2
    for dm, (min_dM, max_dM) in elements_mass_difference.items():
        i = length - dm
        if i >= 0:
            min_mz = min(min_mz, envelope[i].mz + min_dM / charge)
            max_mz = max(max_mz, envelope[i].mz + max_dM / charge)
    min_mz -= mz_tolerance
    max_mz += mz_tolerance
    return min_mz, max_mz


def _make_exact_mass_difference_bounds(
    elements: list["Element"], min_p: float
) -> dict[int, Tuple[float, float]]:
    """
    Computes possible mass differences obtained from changing one isotope.

    Parameters
    ----------
    elements : list[Element]
    min_p : float
        Number between 0 and 1. Minimum abundance of the isotopes used.

    Returns
    -------
    bounds : dict
        Mapping of nominal mass increments to (min, max) exact mass
        increments, used by _get_next_mz_search_interval to estimate valid
        m/z ranges for isotopologues.

    """
    bounds = dict()
    for e in elements:
        m, M, p = e.get_abundances()
        for i in range(1, M.size):
            if p[i] > min_p:
                dm = m[i] - m[0]
                dM = M[i] - M[0]
                bounds.setdefault(dm, []).append(dM)

    # collapse each list of exact mass increments into a (min, max) pair
    for dm in bounds:
        bounds[dm] = min(bounds[dm]), max(bounds[dm])
    return bounds
# --------------------------------------------------------------------------
# File: src/tidyms/annotation/mmi_finder.py
# --------------------------------------------------------------------------
import numpy as np
import bisect
from typing import Optional
from .annotation_data import AnnotationData
from ..chem.atoms import Element, PeriodicTable, EM
from ..chem._formula_generator import FormulaCoefficientBounds
from ..chem.envelope_tools import make_formula_coefficients_envelopes
from ..lcms import Feature


class MMIFinder:
    """
    Finds Minimum Mass Isotopologue (MMI) candidates using an array of feature
    m/z and an array of feature area.

    """

    def __init__(
        self,
        bounds: dict[str, tuple[int, int]],
        max_mass: float,
        max_charge: int,
        length: int,
        bin_size: int,
        mz_tol: float,
        p_tol: float,
        min_similarity: float,
        custom_abundances: Optional[dict[str, np.ndarray]] = None,
    ):
        """
        Constructor method.

        Parameters
        ----------
        bounds : dict
            Mapping from an element symbol str to the minimum and maximum
            allowed values in formulas.
        max_mass : float
            Maximum mass to build rules.
        max_charge : int
            Maximum charge state searched; its sign sets the polarity used
            for the electron mass correction.
        length : int
            Length of the theoretical envelopes used to compute the search
            rules.
        bin_size : int
            Mass interval used to build the rules.
        mz_tol : float
            m/z tolerance to search candidates.
        p_tol : float
            Abundance tolerance used to search candidates.
        min_similarity : float
            Minimum similarity to create candidates.
        custom_abundances : dict, optional
            Provides custom elemental abundances. A mapping from element
            symbols str to an abundance array. The abundance array must have
            the same size that the natural abundance and its sum must be equal
            to one. For example, for "C", an alternative abundance can be
            array([0.15, 0.85]) for isotopes with nominal mass 12 and 13.

        """
        self.rules = _create_rules_dict(
            bounds, max_mass, length, bin_size, p_tol, custom_abundances
        )
        self.bin_size = bin_size
        self.max_charge = abs(max_charge)
        self.polarity = 1 if max_charge >= 0 else -1
        self.max_mass = max_mass
        self.mz_tol = mz_tol
        self.p_tol = p_tol
        self.min_similarity = min_similarity

    def find(self, data: AnnotationData) -> list[tuple[Feature, int]]:
        """
        Search MMI candidates using m/z and area information from a feature
        list.

        Parameters
        ----------
        data : AnnotationData
            Feature data; its feature list is sorted by m/z.

        Returns
        -------
        mmi_candidates : list[tuple[Feature, int]]
            (feature, charge) pairs, assuming that the monoisotopic feature
            is part of the envelope but not necessarily the MMI.

        """
        mono = data.get_monoisotopologue()
        candidates = list()

        if mono is None:
            return candidates

        for charge in range(1, self.max_charge + 1):
            # neutral mass for this charge state (electron mass corrected)
            M_mono = mono.mz * charge - self.polarity * charge * EM
            if M_mono < self.max_mass:
                # the monoisotopologue itself may be the MMI
                candidates.append((mono, charge))
                M_bin = int(M_mono // self.bin_size)
                mmi_rules = self.rules.get(M_bin)
                if mmi_rules is not None:
                    for i_rules in mmi_rules:
                        i_candidates = _find_candidate(
                            data,
                            mono,
                            charge,
                            i_rules,
                            self.mz_tol,
                            self.p_tol,
                            self.max_mass,
                            self.min_similarity,
                        )
                        candidates.extend(i_candidates)
        return candidates


def _find_candidate(
    data: AnnotationData,
    mono: Feature,
    charge: int,
    i_rules: dict,
    mz_tol: float,
    p_tol: float,
    max_mass: float,
    min_similarity: float,
) -> list[tuple[Feature, int]]:
    """
    Search MMI candidates compatible with one (dM, qp) rule.

    FIX: the return annotation previously read ``list[tuple[int, int]]`` but
    (Feature, charge) pairs are returned.
    """
    # valid m/z window for the MMI, below the monoisotopologue
    min_dM, max_dM = i_rules["dM"]
    min_mz = mono.mz - max_dM / charge - mz_tol
    max_mz = mono.mz - min_dM / charge + mz_tol
    min_qp = i_rules["qp"][0] - p_tol
    max_qp = i_rules["qp"][1] + p_tol

    if (mono.mz * charge) < max_mass:
        start = bisect.bisect(data.features, min_mz)
        end = bisect.bisect(data.features, max_mz)
    else:
        start, end = 0, 0  # dummy values: empty search range

    # if valid m/z were found, check if the abundance quotient qp is valid
    candidates = list()
    for k in range(start, end):
        candidate = data.features[k]
        is_valid = _check_candidate(
            data, mono, candidate, min_similarity, min_qp, max_qp
        )
        if is_valid:
            candidates.append((candidate, charge))
    return candidates


def _check_candidate(
    data: AnnotationData,
    mono: Feature,
    candidate: Feature,
    min_similarity: float,
    min_qp: float,
    max_qp: float,
) -> bool:
    """Check similarity and abundance-quotient compatibility of a candidate."""
    if candidate not in data.non_annotated:
        return False

    similarity = data.similarity_cache.get_similarity(mono, candidate)
    if similarity < min_similarity:
        return False

    # abundance quotient of the monoisotopologue relative to the MMI
    mmi_mono_pair = [candidate, mono]
    _, p = mono.compute_isotopic_envelope(mmi_mono_pair)
    qp = p[1] / p[0]
    is_valid_qp = (qp >= min_qp) & (qp <= max_qp)
    return is_valid_qp


def _create_rules_dict(
    bounds: dict[str, tuple[int, int]],
    max_mass: float,
    length: int,
    bin_size: int,
    p_tol: float,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> dict[int, list[dict[str, tuple[float, float]]]]:
    """Build, per mass bin, the (dM, qp) ranges an MMI must fulfill."""
    Ma, pa = _create_envelope_arrays(bounds, max_mass, length, custom_abundances)
    # find the monoisotopic index, its mass difference with the MMI (dM) and
    # its abundance quotient with the MMI (qp)
    bins = (Ma[:, 0] // bin_size).astype(int)

    # unique bins are used as keys for the rule dictionary
    unique_bins = np.unique(bins)

    rules = dict()
    for b in unique_bins:
        b_rules = list()
        bin_mask = bins == b
        for mi in range(1, length):
            qp = pa[bin_mask, mi] / pa[bin_mask, 0]
            dM = Ma[bin_mask, mi] - Ma[bin_mask, 0]
            # keep formulas where the mi-th isotopologue can dominate the MMI
            qp_mask = qp >= (1.0 - p_tol)
            if qp_mask.any():
                mi_rules = dict()
                dM_b_mi = dM[qp_mask]
                qp_b_mi = qp[qp_mask]
                mi_rules["dM"] = dM_b_mi.min(), dM_b_mi.max()
                mi_rules["qp"] = qp_b_mi.min(), qp_b_mi.max()
                b_rules.append(mi_rules)
        if b_rules:
            rules[b] = b_rules
    return rules


def _create_envelope_arrays(
    bounds: dict[str, tuple[int, int]],
    M_max: float,
    max_length: int,
    custom_abundances: Optional[dict[str, np.ndarray]],
) -> tuple[np.ndarray, np.ndarray]:
    """Compute exact mass and abundance arrays for all candidate formulas."""
    elements = _select_elements(list(bounds), custom_abundances)
    isotopes = [x.get_mmi() for x in elements]
    f_bounds = FormulaCoefficientBounds({x: bounds[x.get_symbol()] for x in isotopes})
    coeff = f_bounds.make_coefficients(M_max)
    envelope = make_formula_coefficients_envelopes(
        bounds, coeff, max_length, custom_abundances
    )
    return envelope.M, envelope.p


def _select_two_isotope_element(
    e_list: list[str], dm: int, custom_abundances: dict[str, np.ndarray]
) -> list[str]:
    """
    Select two-isotope elements with the given nominal mass difference.

    Elements whose second isotope dominates are all kept; of the remaining
    ones only the element with the most abundant second isotope is kept.
    """
    selected = list()
    p_dm_max = 0
    best_p0_greater_than_pi = None
    for s in e_list:
        e = PeriodicTable().get_element(s)
        n_isotopes = len(e.isotopes)
        m, _, p = e.get_abundances()
        if n_isotopes == 2:
            e_dm = m[-1] - m[0]
            if e_dm == dm:
                p0, pi = custom_abundances.get(s, p)
                if pi > p0:
                    selected.append(s)
                elif pi > p_dm_max:
                    p_dm_max = pi
                    best_p0_greater_than_pi = s
    if best_p0_greater_than_pi is not None:
        selected.append(best_p0_greater_than_pi)
    return selected


def _select_multiple_isotope_elements(e_list: list[str]) -> list[str]:
    """Select the elements with more than two stable isotopes."""
    selected = list()
    for s in e_list:
        e = PeriodicTable().get_element(s)
        if len(e.isotopes) > 2:
            selected.append(s)
    return selected


def _select_elements(
    e_list: list[str], custom_abundances: Optional[dict[str, np.ndarray]] = None
) -> list[Element]:
    """Select the elements that dominate the MMI search rules."""
    if custom_abundances is None:
        custom_abundances = dict()
    # NOTE: the selection helpers always return a list, so the previous
    # `is not None` guards were dead checks and are removed.
    selected = _select_multiple_isotope_elements(e_list)
    selected.extend(_select_two_isotope_element(e_list, 1, custom_abundances))
    selected.extend(_select_two_isotope_element(e_list, 2, custom_abundances))
    return [PeriodicTable().get_element(x) for x in selected]
# --------------------------------------------------------------------------
# File: src/tidyms/chem/__init__.py
# --------------------------------------------------------------------------
"""
Chemistry
=========

Provides:

1. A Formula object to compute the exact mass and isotopic distribution of molecular formulas.
2. A PeriodicTable with element and isotope information.
3. A formula generator object to search molecular formulas based on exact mass values.
4. An EnvelopeScorer that scores the similarity between experimental and theoretical isotopic envelopes.

Objects
-------
- PeriodicTable
- Formula
- FormulaGenerator
- EnvelopeScorer

Constants
---------
- EM : electron mass

"""

from ._formula_generator import FormulaGenerator, get_chnops_bounds
from .envelope_tools import EnvelopeScorer, EnvelopeValidator
from .formula import Formula
from .atoms import EM, PeriodicTable

# --------------------------------------------------------------------------
# File: src/tidyms/chem/_envelope_utils.py
# --------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Utilities to compute isotopic envelopes.

"""

import numpy as np
from functools import cache
from scipy.stats import multinomial
from typing import Dict, Optional, Tuple
from .atoms import Isotope, PeriodicTable
from . import utils


def make_envelope_arrays(
    isotope: Isotope, n_min: int, n_max: int, max_length: int, p=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Creates an array of exact mass and abundance for homonuclear formulas.

    Parameters
    ----------
    isotope : Isotope
    n_min : int
        Minimum formula coefficient.
    n_max : int
        Maximum formula coefficient.
    max_length : int
        Length of the envelope.
    p : array or None, default=None
        Element abundance. If None, the natural abundance is used.

    Returns
    -------
    M : (n_max - n_min + 1, max_length) array
        Coefficients exact mass.
    p : (n_max - n_min + 1, max_length) array
        Coefficients abundance.

    """
    rows = n_max - n_min + 1
    M_arr = np.zeros((rows, max_length))
    p_arr = np.zeros((rows, max_length))
    for k in range(n_min, n_max + 1):
        Mk, pk = _get_n_atoms_envelope(isotope, k, max_length, p=p)
        M_arr[k - n_min] = Mk
        p_arr[k - n_min] = pk
    return M_arr, p_arr


def find_formula_envelope(
    composition: Dict[Isotope, int],
    max_length: int,
    p: Optional[Dict[str, np.ndarray]] = None,
    min_p: float = 1e-10,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the isotopic envelope for a formula.

    Parameters
    ----------
    composition : Dict[Isotope, int]
        Mapping from isotopes to formula coefficients.
    max_length : int
        Length of the computed envelope.
    p : dict, optional
        Mapping from element symbols to custom abundance arrays.
    min_p : float, default=1e-10
        Entries with an abundance below this value are removed.
        NOTE(review): the mask-and-flatten removes low-abundance entries
        anywhere in the envelope, which shifts the position of later
        isotopologues if a middle entry falls below `min_p` — confirm this
        only ever trims trailing entries in practice.

    Returns
    -------
    Mf, pf : 1D arrays with the exact mass and abundance of each isotopologue.

    """
    if p is None:
        p = dict()

    # initialize an empty envelope for the formula
    Mf = np.zeros((1, max_length), dtype=float)
    pf = np.zeros((1, max_length), dtype=float)
    pf[0, 0] = 1

    # convolve the envelope of each element into the formula envelope
    for i, coeff in composition.items():
        i_p = p.get(i.get_symbol())
        Mi, pi = _get_n_atoms_envelope(i, coeff, max_length, p=i_p)
        Mi = Mi.reshape((1, Mi.size))
        pi = pi.reshape((1, pi.size))
        Mf, pf = combine_envelopes(Mf, pf, Mi, pi)
    valid_p_mask = pf >= min_p
    pf = pf[valid_p_mask].flatten()
    Mf = Mf[valid_p_mask].flatten()
    return Mf, pf


def combine_envelopes(
    M1: np.ndarray,
    p1: np.ndarray,
    M2: np.ndarray,
    p2: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combines exact mass and abundance of two envelopes.

    All arrays must be 2-dimensional and have the same shape.

    """
    shape = M1.shape
    M = np.zeros(shape, dtype=float)
    p = np.zeros(shape, dtype=float)
    # Ignore zero division errors when normalizing by pk
    with np.errstate(divide="ignore", invalid="ignore"):
        for k in range(shape[1]):
            # discrete convolution of the abundances, and abundance-weighted
            # average of the exact masses, for the k-th isotopologue
            pk = (p1[:, : k + 1] * p2[:, k::-1]).sum(axis=1)
            k1 = k + 1
            k2 = k
            Mk = (p1[:, :k1] * M1[:, :k1] * p2[:, k2::-1]) + (
                p1[:, :k1] * M2[:, k2::-1] * p2[:, k2::-1]
            )
            M[:, k] = Mk.sum(axis=1) / pk
            p[:, k] = pk
    np.nan_to_num(M, copy=False)
    return M, p


def _get_n_atoms_envelope(
    isotope: Isotope, n: int, max_length: int, p: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the exact mass and abundance envelope of n atoms.

    If the isotope is the monoisotope and p is ``None``, the natural
    abundances for the element are used.

    If the isotope is the monoisotope and a custom abundance `p` is provided,
    the envelope is computed using this value instead of the natural
    abundances.

    If the isotope is not the monoisotope, it is assumed that only this
    isotope contributes to the envelope.

    """
    symbol = isotope.get_symbol()
    element = PeriodicTable().get_element(symbol)
    is_monoisotope = isotope.a == element.get_monoisotope().a
    n_isotopes = len(element.isotopes)
    if is_monoisotope and (n_isotopes > 1):
        if n == 0:
            M, p = _get_n_isotopes_envelope(isotope, n, max_length)
        elif p is None:
            M, p = _get_n_atoms_natural_abundance(symbol, n, max_length)
        else:
            # custom abundance: validate against the element's isotopes
            m, M, _ = element.get_abundances()
            _validate_abundance(p, m, symbol)
            M, p = _get_n_atoms_envelope_aux(m, M, p, n, max_length)
    else:
        M, p = _get_n_isotopes_envelope(isotope, n, max_length)
    return M, p
@cache
def _get_n_atoms_natural_abundance(symbol: str, n: int, max_length: int):
    """
    Computes the envelope of n atoms using the natural abundance.

    Results are memoized: natural abundances are fixed per element, so the
    envelope only depends on (symbol, n, max_length).

    aux function to _get_n_atoms_envelope

    """
    m, M, p = PeriodicTable().get_element(symbol).get_abundances()
    return _get_n_atoms_envelope_aux(m, M, p, n, max_length)


def _get_n_atoms_envelope_aux(
    m: np.ndarray, M: np.ndarray, p: np.ndarray, n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the envelope of n atoms from isotope masses and abundances.

    Parameters
    ----------
    m : array[int]
        Nominal mass of each isotope.
    M : array[float]
        Exact mass of each isotope.
    p : array[float]
        Abundance of each isotope.
    n : int
        Number of atoms.
    max_length : int
        Length of the returned envelope.

    aux function to _get_n_atoms_envelope.

    """
    n_isotopes = p.size
    # find combinations of isotopes that sum n
    combinations = _find_n_isotope_combination(n_isotopes, n)

    # find m, M and p for each combination of isotopes
    multinomial_dist = multinomial(n, p)
    m = np.matmul(combinations, m)
    M = np.matmul(combinations, M)
    p = multinomial_dist.pmf(combinations)

    # sort by exact mass
    sorted_index = np.argsort(M)
    m, M, p = m[sorted_index], M[sorted_index], p[sorted_index]

    # merge values with the same nominal mass: abundances are summed and the
    # exact mass becomes the abundance-weighted average
    _, first_occurrence = np.unique(m, return_index=True)
    m_unique = np.zeros(max_length, dtype=m.dtype)
    M_unique = np.zeros(max_length, dtype=M.dtype)
    p_unique = np.zeros(max_length, dtype=p.dtype)
    # add the length of m to include all nominal mass values
    n_unique = first_occurrence.size
    first_occurrence = list(first_occurrence)
    first_occurrence.append(m.size)
    m0 = m[0]
    for k in range(max_length):
        if k < n_unique:
            start = first_occurrence[k]
            end = first_occurrence[k + 1]
            mk = m[start]
            i = mk - m0
            if i < max_length:
                m_unique[i] = mk
                pk = np.sum(p[start:end])
                p_unique[i] = pk
                M_unique[i] = np.sum(M[start:end] * p[start:end]) / pk
    p_unique = p_unique / np.sum(p_unique)
    return M_unique, p_unique


def _fill_missing_nominal(
    m: np.ndarray, M: np.ndarray, p: np.ndarray, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Re-index an envelope so that position k holds the isotopologue with
    nominal mass m[0] + k, filling gaps with zeros.

    If the nominal masses are already contiguous starting at m[0], the input
    arrays are returned unchanged.

    """
    rel_m = m - m[0]
    dm = np.arange(max_length)
    M_filled = np.zeros(max_length, dtype=M.dtype)
    p_filled = np.zeros(max_length, dtype=p.dtype)
    if not np.array_equal(rel_m, dm):
        for k, rel_m_k in enumerate(rel_m):
            if 0 <= rel_m_k < max_length:
                M_filled[rel_m_k] = M[k]
                p_filled[rel_m_k] = p[k]
            else:
                # rel_m is sorted, later entries are also out of range
                break
        M, p = M_filled, p_filled
    return M, p


def _find_n_isotope_combination(n_isotopes, n):
    """
    Finds combinations of isotopes such that the sum is n.

    aux function to _get_n_atoms_envelope_aux (the previous docstring pointed
    to a nonexistent ``_find_n_atoms_abundances``).

    """
    n_ranges = [range(x) for x in ([n + 1] * n_isotopes)]
    combinations = utils.cartesian_product(*n_ranges).astype(int)
    valid_combinations = combinations.sum(axis=1) == n
    combinations = combinations[valid_combinations, :]
    return combinations
def _validate_abundance(p: np.ndarray, m: np.ndarray, symbol: str):
    """
    Checks that user-created abundances are non-negative, normalized to 1 and
    have the same length as the number of stable isotopes.

    Raises
    ------
    ValueError
        If any of the conditions is not met.

    aux function to _get_n_atoms_envelope.

    """
    is_all_non_negative = (p >= 0.0).all()
    is_normalized = np.isclose(p.sum(), 1.0)
    is_same_size = p.size == m.size
    if not is_same_size:
        msg = "{} has {} stable isotopes. `p` must have the same size."
        raise ValueError(msg.format(symbol, m.size))
    elif not (is_normalized and is_all_non_negative):
        msg = "`p` elements must be non-negative and their sum normalized to 1."
        raise ValueError(msg)


def _get_n_isotopes_envelope(
    isotope: "Isotope", n: int, max_length: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Creates the isotopic envelope for n atoms of a single isotope: all the
    abundance is concentrated at the first position.

    aux function to _get_n_atoms_envelope.

    """
    M = np.zeros(max_length, dtype=float)
    p = np.zeros(max_length, dtype=float)
    M[0] = isotope.m * n
    p[0] = 1.0
    return M, p


# --------------------------------------------------------------------------
# File: src/tidyms/chem/atoms.py
# --------------------------------------------------------------------------
"""
Tools for working with Isotopes and Elements.

Objects
-------
- Element
- Isotope
- PeriodicTable

Constants
---------
- EM: Mass of the electron.

Exceptions
----------
- InvalidIsotope

"""
import json
import numpy as np
import os.path
from string import digits
from typing import Dict, Final, Tuple, Union


EM: Final[float] = 0.00054858  # electron mass


class Isotope:
    """
    Representation of an Isotope.

    Attributes
    ----------
    z: int
        Atomic number
    n: int
        Neutron number
    a: int
        Mass number
    m: float
        Exact mass.
    defect: float
        Difference between the exact mass and mass number.
    abundance: float
        Relative abundance of the isotope.

    """

    __slots__ = ("z", "n", "a", "m", "defect", "abundance")

    def __init__(self, z: int, a: int, m: float, abundance: float):
        self.z = z
        self.n = a - z
        self.a = a
        self.m = m
        self.defect = m - a
        self.abundance = abundance

    def __str__(self):
        return "{}{}".format(self.a, self.get_symbol())

    def __repr__(self):
        return "Isotope({})".format(str(self))

    def get_element(self) -> "Element":
        """Return the Element this isotope belongs to."""
        return PeriodicTable().get_element(self.z)

    def get_symbol(self) -> str:
        """Return the element symbol of this isotope."""
        return self.get_element().symbol


class Element(object):
    """
    Representation of a chemical element.

    Attributes
    ----------
    name : str
        Element name.
    symbol : str
        Element symbol
    isotopes : Dict[int, Isotope]
        Mapping from mass number to an isotope
    z : int
        Atomic number.
    nominal_mass : int
        Mass number of the most abundant isotope
    monoisotopic_mass : float
        Exact mass of the most abundant isotope.
    mass_defect : float
        Difference between the monoisotopic mass and the nominal mass.

    """

    def __init__(self, symbol: str, name: str, isotopes: Dict[int, Isotope]):
        self.name = name
        self.symbol = symbol
        self.isotopes = isotopes
        monoisotope = self.get_monoisotope()
        self.z = monoisotope.z
        self.nominal_mass = monoisotope.a
        self.monoisotopic_mass = monoisotope.m
        self.mass_defect = self.monoisotopic_mass - self.nominal_mass

    def __repr__(self):
        return "Element({})".format(self.symbol)

    def __str__(self):  # pragma: no cover
        return self.symbol

    def get_abundances(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Returns the Mass number, exact mass and abundance of each Isotope.

        Returns
        -------
        m: array[int]
            Mass number of each isotope.
        M: array[float]
            Exact mass of each isotope.
        p: array[float]
            Abundance of each isotope.

        """
        isotopes = list(self.isotopes.values())
        m = np.array([x.a for x in isotopes], dtype=int)
        M = np.array([x.m for x in isotopes])
        p = np.array([x.abundance for x in isotopes])
        return m, M, p

    def get_mmi(self) -> Isotope:
        """Returns the isotope with the lowest atomic mass."""
        return min(self.isotopes.values(), key=lambda x: x.a)

    def get_monoisotope(self) -> Isotope:
        """Returns the most abundant isotope."""
        return max(self.isotopes.values(), key=lambda x: x.abundance)


def PeriodicTable():
    """
    Reference the PeriodicTable object.

    NOTE(review): lazy singleton; not thread-safe on first access.

    Examples
    --------
    >>> import tidyms as ms
    >>> ptable = ms.chem.PeriodicTable()

    """
    if _PeriodicTable.instance is None:
        _PeriodicTable.instance = _PeriodicTable()
    return _PeriodicTable.instance


class _PeriodicTable:
    """
    Periodic Table representation. Contains element and isotope information.

    Methods
    -------
    get_element
    get_isotope

    """

    instance = None

    def __init__(self):
        self._symbol_to_element = _make_periodic_table()
        self._z_to_element = {v.z: v for v in self._symbol_to_element.values()}
        self._za_to_isotope = dict()
        self._str_to_isotope = dict()
        for el_str in self._symbol_to_element:
            el = self._symbol_to_element[el_str]
            for isotope in el.isotopes.values():
                self._za_to_isotope[(isotope.z, isotope.a)] = isotope
                self._str_to_isotope[str(isotope.a) + el_str] = isotope

    def get_element(self, element: Union[str, int]) -> Element:
        """
        Returns an Element object using its symbol or atomic number.

        Parameters
        ----------
        element : str or int
            element symbol or atomic number.

        Returns
        -------
        Element

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> h = ptable.get_element("H")
        >>> c = ptable.get_element(6)

        """
        if isinstance(element, int):
            return self._z_to_element[element]
        return self._symbol_to_element[element]

    def __iter__(self):
        for el in self._symbol_to_element.values():
            yield el

    def get_isotope(self, x: str, copy: bool = False) -> Isotope:
        """
        Returns an isotope object from a string representation.

        Parameters
        ----------
        x : str
            A string representation of an isotope. If only the symbol is
            provided in the string, the monoisotope is returned.
        copy : bool
            If True creates a new Isotope object.

        Returns
        -------
        Isotope

        Raises
        ------
        InvalidIsotope
            If `x` cannot be interpreted as an isotope or element.

        Examples
        --------
        >>> import tidyms as ms
        >>> ptable = ms.chem.PeriodicTable()
        >>> d = ptable.get_isotope("2H")
        >>> cl35 = ptable.get_isotope("Cl")

        """
        # FIX: also catch IndexError so an empty string raises InvalidIsotope
        # instead of leaking a bare IndexError from x[0].
        try:
            if x[0] in digits:
                isotope = self._str_to_isotope[x]
            else:
                isotope = self.get_element(x).get_monoisotope()
            if copy:
                isotope = Isotope(isotope.z, isotope.a, isotope.m, isotope.abundance)
            return isotope
        except (KeyError, IndexError):
            msg = "{} is not a valid input.".format(x)
            raise InvalidIsotope(msg)
If only the symbol is 221 | provided in the string, the monoisotope is returned. 222 | copy : bool 223 | If True creates a new Isotope object. 224 | 225 | Returns 226 | ------- 227 | Isotope 228 | 229 | Examples 230 | -------- 231 | >>> import tidyms as ms 232 | >>> ptable = ms.chem.PeriodicTable() 233 | >>> d = ptable.get_isotope("2H") 234 | >>> cl35 = ptable.get_isotope("Cl") 235 | 236 | """ 237 | try: 238 | if x[0] in digits: 239 | isotope = self._str_to_isotope[x] 240 | else: 241 | isotope = self.get_element(x).get_monoisotope() 242 | if copy: 243 | isotope = Isotope(isotope.z, isotope.a, isotope.m, isotope.abundance) 244 | return isotope 245 | except KeyError: 246 | msg = "{} is not a valid input.".format(x) 247 | raise InvalidIsotope(msg) 248 | 249 | 250 | def _make_periodic_table() -> Dict[str, Element]: 251 | this_dir, _ = os.path.split(__file__) 252 | elements_path = os.path.join(this_dir, "elements.json") 253 | with open(elements_path, "r") as fin: 254 | element_data = json.load(fin) 255 | 256 | isotopes_path = os.path.join(this_dir, "isotopes.json") 257 | with open(isotopes_path, "r") as fin: 258 | isotope_data = json.load(fin) 259 | 260 | periodic_table = dict() 261 | for element in isotope_data: 262 | element_isotopes = isotope_data[element] 263 | isotopes = {x["a"]: Isotope(**x) for x in element_isotopes} 264 | name = element_data[element] 265 | periodic_table[element] = Element(element, name, isotopes) 266 | return periodic_table 267 | 268 | 269 | class InvalidIsotope(ValueError): 270 | pass 271 | -------------------------------------------------------------------------------- /src/tidyms/chem/elements.json: -------------------------------------------------------------------------------- 1 | { 2 | "Xx": "Dummy", 3 | "H": "Hydrogen", 4 | "He": "Helium", 5 | "Li": "Lithium", 6 | "Be": "Beryllium", 7 | "B": "Boron", 8 | "C": "Carbon", 9 | "N": "Nitrogen", 10 | "O": "Oxygen", 11 | "F": "Fluorine", 12 | "Ne": "Neon", 13 | "Na": "Sodium", 14 | "Mg": 
"Magnesium", 15 | "Al": "Aluminium", 16 | "Si": "Silicon", 17 | "P": "Phosphorus", 18 | "S": "Sulfur", 19 | "Cl": "Chlorine", 20 | "Ar": "Argon", 21 | "K": "Potassium", 22 | "Ca": "Calcium", 23 | "Sc": "Scandium", 24 | "Ti": "Titanium", 25 | "V": "Vanadium", 26 | "Cr": "Chromium", 27 | "Mn": "Manganese", 28 | "Fe": "Iron", 29 | "Co": "Cobalt", 30 | "Ni": "Nickel", 31 | "Cu": "Copper", 32 | "Zn": "Zinc", 33 | "Ga": "Gallium", 34 | "Ge": "Germanium", 35 | "As": "Arsenic", 36 | "Se": "Selenium", 37 | "Br": "Bromine", 38 | "Kr": "Krypton", 39 | "Rb": "Rubidium", 40 | "Sr": "Strontium", 41 | "Y": "Yttrium", 42 | "Zr": "Zirconium", 43 | "Nb": "Niobium", 44 | "Mo": "Molybdenum", 45 | "Tc": "Technetium", 46 | "Ru": "Ruthenium", 47 | "Rh": "Rhodium", 48 | "Pd": "Palladium", 49 | "Ag": "Silver", 50 | "Cd": "Cadmium", 51 | "In": "Indium", 52 | "Sn": "Tin", 53 | "Sb": "Antimony", 54 | "Te": "Tellurium", 55 | "I": "Iodine", 56 | "Xe": "Xenon", 57 | "Cs": "Caesium", 58 | "Ba": "Barium", 59 | "La": "Lanthanum", 60 | "Ce": "Cerium", 61 | "Pr": "Praseodymium", 62 | "Nd": "Neodymium", 63 | "Pm": "Promethium", 64 | "Sm": "Samarium", 65 | "Eu": "Europium", 66 | "Gd": "Gadolinium", 67 | "Tb": "Terbium", 68 | "Dy": "Dysprosium", 69 | "Ho": "Holmium", 70 | "Er": "Erbium", 71 | "Tm": "Thulium", 72 | "Yb": "Ytterbium", 73 | "Lu": "Lutetium", 74 | "Hf": "Hafnium", 75 | "Ta": "Tantalum", 76 | "W": "Tungsten", 77 | "Re": "Rhenium", 78 | "Os": "Osmium", 79 | "Ir": "Iridium", 80 | "Pt": "Platinum", 81 | "Au": "Gold", 82 | "Hg": "Mercury", 83 | "Tl": "Thallium", 84 | "Pb": "Lead", 85 | "Bi": "Bismuth", 86 | "Po": "Polonium", 87 | "At": "Astatine", 88 | "Rn": "Radon", 89 | "Fr": "Francium", 90 | "Ra": "Radium", 91 | "Ac": "Actinium", 92 | "Th": "Thorium", 93 | "Pa": "Protactinium", 94 | "U": "Uranium", 95 | "Np": "Neptunium", 96 | "Pu": "Plutonium", 97 | "Am": "Americium", 98 | "Cm": "Curium", 99 | "Bk": "Berkelium", 100 | "Cf": "Californium", 101 | "Es": "Einsteinium", 102 | "Fm": "Fermium", 103 
| "Md": "Mendelevium", 104 | "No": "Nobelium", 105 | "Lr": "Lawrencium", 106 | "Rf": "Rutherfordium", 107 | "Db": "Dubnium", 108 | "Sg": "Seaborgium", 109 | "Bh": "Bohrium", 110 | "Hs": "Hassium", 111 | "Mt": "Meitnerium", 112 | "Ds": "Darmstadtium", 113 | "Rg": "Roentgenium", 114 | "Cn": "Copernicium", 115 | "Uut": "Ununtrium", 116 | "Fl": "Flerovium", 117 | "Uup": "Ununpentium", 118 | "Lv": "Livermorium", 119 | "Uus": "Ununseptium", 120 | "Uuo": "Ununoctium" 121 | } -------------------------------------------------------------------------------- /src/tidyms/chem/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | functions and classes used in different modules 4 | """ 5 | 6 | import numpy as np 7 | 8 | 9 | def cartesian_product(*args): 10 | res = None 11 | for x in args: 12 | if res is None: 13 | # initialize cartesian product array 14 | res = np.array(x) 15 | res = res.reshape((res.size, 1)) 16 | else: 17 | x = np.array(x) 18 | row, col = res.shape 19 | new_res_shape = (row * x.size, col + 1) 20 | new_res = np.zeros(shape=new_res_shape, dtype=res.dtype) 21 | ind = np.repeat(np.arange(row), x.size) 22 | new_col = np.tile(x, row) 23 | new_res[:, :col] = res[ind] 24 | new_res[:, -1] = new_col 25 | res = new_res 26 | return res 27 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.1.0 2 | pytest-cov>=3.0.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/griquelme/tidyms/ad9356a099f367076f745406be23bb4c50003239/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: 
import pandas as pd

from tidyms.simulation import simulate_dataset
from tidyms.container import DataContainer
from tidyms import fileio
from tidyms.utils import get_tidyms_path
import numpy as np
import pytest
import os


def pytest_sessionstart(session):
    """Download every public example dataset before the test session starts."""
    for dataset_name in fileio.list_available_datasets(False):
        fileio.download_dataset(dataset_name)


def _simulate_data_container(**extra) -> DataContainer:
    """Create the simulated two-class dataset shared by the fixtures below.

    Extra keyword arguments are forwarded to ``simulate_dataset``.
    """
    population = {"healthy": 20, "disease": 35}
    mean = {
        "healthy": np.array([50, 100, 150]),
        "disease": np.array([150, 200, 300]),
    }
    cov = {"healthy": np.array([1, 1, 1]), "disease": np.array([2, 2, 2])}
    blank_contribution = np.array([3, 5, 10])
    mz = np.array([100, 200, 300])
    rt = np.array([50, 60, 70])
    return simulate_dataset(
        population,
        mean,
        cov,
        mz,
        rt,
        blank_contribution,
        prepend_blank=1,
        append_blank=1,
        **extra,
    )


@pytest.fixture
def data_container_with_order():
    """Simulated dataset that includes run-order and batch information."""
    return _simulate_data_container()


@pytest.fixture
def data_container_with_order_single_qc():
    """Same as ``data_container_with_order`` but without triple QC injections."""
    return _simulate_data_container(triple_qc=False)


@pytest.fixture
def data_container_without_order(data_container_with_order):
    """Copy of the ordered dataset with run order and batch columns removed."""
    source = data_container_with_order
    data_matrix = source.data_matrix.copy()
    sample_metadata = source.sample_metadata.copy()
    sample_metadata.pop("order")
    sample_metadata.pop("batch")
    feature_metadata = source.feature_metadata.copy()
    mapping = dict()
    for key, value in source.mapping.items():
        if value is not None:
            mapping[key] = value
    return DataContainer(data_matrix, feature_metadata, sample_metadata, mapping)


def _raw_data_path(filename: str) -> str:
    """Absolute path of ``filename`` inside the cached test-raw-data folder."""
    return os.path.join(get_tidyms_path(), "test-raw-data", filename)


@pytest.fixture
def centroid_mzml():
    # NOTE(review): ms_mode="profile" for centroid data looks suspicious, but
    # it reproduces the original fixture exactly — confirm before changing.
    path = _raw_data_path("centroid-data-zlib-indexed-compressed.mzML")
    return fileio.MSData.create_MSData_instance(path, ms_mode="profile")


@pytest.fixture
def profile_mzml():
    path = _raw_data_path("profile-data-zlib-indexed-compressed.mzML")
    return fileio.MSData.create_MSData_instance(path, ms_mode="profile")
"""
Test lcms and fileio functionality with real data.

"""

import tidyms as ms
import numpy as np
import pytest
import os


@pytest.fixture
def ms_data_centroid() -> ms.MSData:
    """MSData instance backed by a real centroid-mode mzML file."""
    cache = ms.fileio.get_tidyms_path()
    filename = "centroid-data-zlib-indexed-compressed.mzML"
    return ms.MSData.create_MSData_instance(
        os.path.join(cache, "test-raw-data", filename)
    )


def _collect_times(ms_data, ms_level):
    """Acquisition time of each spectrum at ``ms_level``, as a numpy array."""
    times = [sp.time for _, sp in ms_data.get_spectra_iterator(ms_level=ms_level)]
    return np.array(times)


def test_ms_data_invalid_ms_mode_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.ms_mode = "invalid-mode"


def test_ms_data_invalid_instrument_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.instrument = "invalid-instrument"


def test_ms_data_invalid_separation_setter(ms_data_centroid):
    with pytest.raises(ValueError):
        ms_data_centroid.separation = "invalid-separation"


def test_make_chromatogram_ms_level_1(ms_data_centroid):
    # m/z values known to be present in the file
    mz = np.array([205.098, 524.37, 188.07])
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz)
    expected_rt = _collect_times(ms_data_centroid, 1)
    for chromatogram in chromatograms:
        assert np.array_equal(expected_rt, chromatogram.time)
        assert chromatogram.time.size == chromatogram.spint.size


def test_ms_data_get_spectrum(ms_data_centroid):
    # smoke test: retrieving the first spectrum must not raise
    ms_data_centroid.get_spectrum(0)
    assert True


def test_make_tic_ms_level_1(ms_data_centroid):
    tic = ms.make_tic(ms_data_centroid, ms_level=1)
    expected_rt = _collect_times(ms_data_centroid, 1)
    assert np.array_equal(expected_rt, tic.time)
    assert tic.time.size == tic.spint.size


def test_make_chromatogram_ms_level_2(ms_data_centroid):
    # m/z values known to be present in the file
    mz = np.array([205.098, 524.37, 188.07])
    ms_level = 2
    chromatograms = ms.make_chromatograms(ms_data_centroid, mz, ms_level=ms_level)
    expected_rt = _collect_times(ms_data_centroid, ms_level)
    for chromatogram in chromatograms:
        assert np.array_equal(expected_rt, chromatogram.time)
        assert chromatogram.time.size == chromatogram.spint.size


def test_make_roi(ms_data_centroid):
    for roi in ms.make_roi(ms_data_centroid):
        # the time, intensity and scan arrays must be aligned
        assert roi.time.size == roi.spint.size
        assert roi.time.size == roi.scan.size


def test_accumulate_spectra(ms_data_centroid):
    spectrum = ms.accumulate_spectra(ms_data_centroid, start_time=20, end_time=30)
    assert spectrum.mz.size == spectrum.spint.size
"[C10H20BO3]-", 46 | "[C20H40BO5]2-", 47 | "[C18H19N2O3]-", 48 | "[C18H20N2O3Cl]-", 49 | "[C10H20Cl]-", 50 | ] 51 | rt_list = [50, 75, 150, 200, 200, 175] 52 | amp_list = [10000, 20000, 30000, 25000, 25000, 20000] 53 | return compounds, rt_list, amp_list 54 | 55 | 56 | @pytest.fixture 57 | def feature_list(compound_data) -> list[Peak]: 58 | compounds, rt_list, amp_list = compound_data 59 | mz_grid = np.linspace(100, 1200, 20000) 60 | rt_grid = np.arange(300) 61 | rt_params = list() 62 | mz_params = list() 63 | width = 4 64 | for comp, c_amp, c_rt in zip(compounds, amp_list, rt_list): 65 | f = Formula(comp) 66 | cM, cp = f.get_isotopic_envelope(4) 67 | cmz = [[x, y] for x, y in zip(cM, cp)] 68 | crt = [[c_rt, width, c_amp] for _ in cM] 69 | rt_params.append(crt) 70 | mz_params.append(cmz) 71 | mz_params = np.vstack(mz_params) 72 | rt_params = np.vstack(rt_params) 73 | ms_data = MSData_simulated(mz_grid, rt_grid, mz_params, rt_params, noise=0.025) 74 | 75 | roi_list = make_roi(ms_data, tolerance=0.01) 76 | ft_list = list() 77 | for k, r in enumerate(roi_list): 78 | r.extract_features() 79 | r.index = k 80 | if r.features: 81 | for j, ft in enumerate(r.features): 82 | ft.index = j 83 | ft_list.extend(r.features) 84 | return ft_list 85 | 86 | 87 | def test_annotate(feature_list, annotation_tools_params): 88 | tools = annotation.create_annotation_tools(**annotation_tools_params) 89 | annotation.annotate(feature_list, *tools) 90 | 91 | # group features by isotopologue label. 92 | annotation_check = dict() 93 | for ft in feature_list: 94 | group_list = annotation_check.setdefault(ft.annotation.isotopologue_label, list()) 95 | group_list.append(ft) 96 | annotation_check.pop(-1) 97 | assert len(annotation_check) == 6 98 | for v in annotation_check.values(): 99 | assert len(v) == 4 # features where generated with 4 isotopologues. 
100 | -------------------------------------------------------------------------------- /tests/unit/annotation/test_envelope_finder.py: -------------------------------------------------------------------------------- 1 | from tidyms.annotation import envelope_finder as ef 2 | from tidyms.annotation.annotation_data import AnnotationData 3 | from tidyms.chem import PeriodicTable 4 | from tidyms.chem import Formula 5 | from tidyms.lcms import LCTrace, Peak 6 | import pytest 7 | import numpy as np 8 | from collections.abc import Sequence 9 | 10 | 11 | @pytest.fixture 12 | def formulas(): 13 | formulas = { 14 | "cho": [ 15 | "C27H34O9", 16 | "C62H120O6", 17 | "C59H114O6", 18 | "C62H120O6", 19 | "C56H42O10", 20 | "C17H20O4", 21 | "C54H104O6", 22 | "C48H92O6", 23 | "C52H100O6", 24 | "C54H104O6", 25 | "C47H90O6", 26 | "C50H96O6", 27 | "C56H108O6", 28 | "C21H19O13", 29 | "C57H94O6", 30 | "C58H112O6", 31 | "C64H124O6", 32 | "C24H20O8", 33 | "C17H12O6", 34 | "C61H118O6", 35 | "C47H90O6", 36 | "C6H12O6", 37 | "C63H106O6", 38 | "C40H52O4", 39 | "C61H118O6", 40 | "C61H118O6", 41 | "C57H96O6", 42 | "C37H72O5", 43 | "C28H44O2", 44 | "C29H24O12", 45 | "C51H98O6", 46 | "C39H72O5", 47 | "C46H78O7", 48 | "C54H104O6", 49 | "C63H110O6", 50 | "C21H18O13", 51 | "C53H102O6", 52 | "C62H120O6", 53 | "C59H114O6", 54 | "C41H78O6", 55 | "C25H30O6", 56 | "C51H98O6", 57 | "C53H102O6", 58 | "C43H68O13", 59 | "C37H72O5", 60 | "C59H114O6", 61 | "C15H12O4", 62 | "C16H18O4", 63 | "C61H110O6", 64 | "C58H112O6", 65 | ], 66 | "chnops": [ 67 | "C41H80NO8P", 68 | "C54H104O6", 69 | "C27H40O2", 70 | "C24H26O12", 71 | "C55H106O6", 72 | "C45H80O16P2", 73 | "C50H96O6", 74 | "C8H13NO", 75 | "C35H36O15", 76 | "C48H92O6", 77 | "C63H98O6", 78 | "C15H14O5", 79 | "C18H23N3O6", 80 | "C44H80NO8P", 81 | "C47H90O6", 82 | "C47H84O16P2", 83 | "C14H14O4", 84 | "C46H80NO10P", 85 | "C35H64O9", 86 | "C51H98O6", 87 | "C6H12O6", 88 | "C26H34O7", 89 | "C17H18O4", 90 | "C6H8O9S", 91 | "C63H100O6", 92 | "C51H98O6", 93 | "C6H12O", 94 
| "C50H96O6", 95 | "C56H108O6", 96 | "C61H114O6", 97 | "C57H110O6", 98 | "C44H76NO8P", 99 | "C63H110O6", 100 | "C41H71O8P", 101 | "C16H16O10", 102 | "C21H20O15", 103 | "C4H6O3", 104 | "C16H18O9", 105 | "C51H98O6", 106 | "C57H94O6", 107 | "C4H9NO2", 108 | "C56H108O6", 109 | "C6H8O7", 110 | "C57H98O6", 111 | "C63H110O6", 112 | "C58H112O6", 113 | "C12H16O7S", 114 | "C27H30O12", 115 | "C26H28O16", 116 | "C27H38O12", 117 | ], 118 | } 119 | return formulas 120 | 121 | 122 | @pytest.fixture 123 | def elements(): 124 | elements = {"cho": ["C", "H", "O"], "chnops": ["C", "H", "N", "O", "P", "S"]} 125 | return elements 126 | 127 | 128 | def create_feature_list_from_formula(f_str: str) -> Sequence[Peak]: 129 | f = Formula(f_str) 130 | M, _ = f.get_isotopic_envelope() 131 | if f.charge: 132 | mz = M / abs(f.charge) 133 | else: 134 | mz = M 135 | feature_list = list() 136 | for k_mz in mz: 137 | size = 30 138 | time = np.linspace(0, size, size) 139 | scan = np.arange(size) 140 | spint = np.ones(size) 141 | roi = LCTrace(time, spint, spint * k_mz, scan) 142 | peak = Peak(10, 15, 20, roi) 143 | feature_list.append(peak) 144 | return feature_list 145 | 146 | 147 | @pytest.mark.parametrize("element_set", ["cho", "chnops"]) 148 | def test__make_exact_mass_difference_bounds(elements, element_set): 149 | # test bounds for different element combinations 150 | elements = elements[element_set] 151 | elements = [PeriodicTable().get_element(x) for x in elements] 152 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 153 | # m and M are the bounds for each nominal mass increment 154 | for e in elements: 155 | nom, ex, ab = e.get_abundances() 156 | nom = nom - nom[0] 157 | ex = ex - ex[0] 158 | for i, mi in zip(nom[1:], ex[1:]): 159 | m_min, m_max = bounds[i] 160 | assert m_min <= mi 161 | assert m_max >= mi 162 | 163 | 164 | @pytest.mark.parametrize("element_set", ["cho", "chnops"]) 165 | def test__get_next_mz_search_interval_mz(elements, formulas, element_set): 166 | elements 
= elements[element_set] 167 | elements = [PeriodicTable().get_element(x) for x in elements] 168 | dM_bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 169 | # test bounds for different formulas 170 | for f_str in formulas[element_set]: 171 | feature_list = create_feature_list_from_formula(f_str) 172 | length = len(feature_list) 173 | for k in range(1, length - 1): 174 | k_ft = feature_list[k] 175 | min_mz, max_mz = ef._get_next_mz_search_interval( 176 | feature_list[:k], dM_bounds, 1, 0.005 177 | ) 178 | assert (min_mz < k_ft.mz) and (k_ft.mz < max_mz) 179 | 180 | 181 | @pytest.mark.parametrize("charge", list(range(1, 6))) 182 | def test_get_k_bounds_multiple_charges(elements, formulas, charge): 183 | elements = elements["chnops"] 184 | formulas = formulas["chnops"] 185 | elements = [PeriodicTable().get_element(x) for x in elements] 186 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 187 | for f_str in formulas: 188 | features = create_feature_list_from_formula(f"[{f_str}]{charge}+") 189 | length = len(features) 190 | for k in range(1, length - 1): 191 | m_min, m_max = ef._get_next_mz_search_interval( 192 | features[:k], bounds, charge, 0.005 193 | ) 194 | assert (m_min < features[k]) and (features[k] < m_max) 195 | 196 | 197 | @pytest.mark.parametrize( 198 | "elements_set,charge", [["cho", 1], ["cho", 2], ["chnops", 1], ["chnops", 2]] 199 | ) 200 | def test__find_envelopes(formulas, elements, elements_set, charge): 201 | # test that the function works using as a list m/z values generated from 202 | # formulas. 
203 | elements = elements[elements_set] 204 | formulas = formulas[elements_set] 205 | elements = [PeriodicTable().get_element(x) for x in elements] 206 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 207 | max_length = 10 208 | mz_tol = 0.005 209 | min_similarity = 0.9 210 | for f_str in formulas: 211 | f_str = f"[{f_str}]{charge}+" 212 | features = create_feature_list_from_formula(f_str) 213 | data = AnnotationData(features) 214 | mmi = data.features[0] 215 | results = ef._find_envelopes( 216 | data.features, 217 | mmi, 218 | data.non_annotated, 219 | data.similarity_cache, 220 | charge, 221 | max_length, 222 | mz_tol, 223 | min_similarity, 224 | bounds, 225 | ) 226 | expected = features 227 | assert results[0] == expected 228 | 229 | 230 | @pytest.mark.parametrize("elements_set", ["cho", "chnops"]) 231 | def test__find_envelopes_no_charge(formulas, elements, elements_set): 232 | # test that the function works using as a list m/z values generated from 233 | # formulas. 234 | elements = elements[elements_set] 235 | formulas = formulas[elements_set] 236 | elements = [PeriodicTable().get_element(x) for x in elements] 237 | bounds = ef._make_exact_mass_difference_bounds(elements, 0.0) 238 | max_length = 10 239 | charge = 0 240 | mz_tol = 0.005 241 | min_similarity = 0.9 242 | for f_str in formulas: 243 | features = create_feature_list_from_formula(f_str) 244 | data = AnnotationData(features) 245 | mmi = features[0] 246 | results = ef._find_envelopes( 247 | features, 248 | mmi, 249 | data.non_annotated, 250 | data.similarity_cache, 251 | charge, 252 | max_length, 253 | mz_tol, 254 | min_similarity, 255 | bounds, 256 | ) 257 | expected = features 258 | assert results[0] == expected 259 | 260 | 261 | def test_EnvelopeFinder(elements, formulas): 262 | elements = elements["chnops"] 263 | formulas = formulas["chnops"] 264 | envelope_finder = ef.EnvelopeFinder(elements, 0.005, max_length=10) 265 | charge = 1 266 | for f_str in formulas: 267 | features = 
create_feature_list_from_formula(f_str) 268 | mmi = features[0] 269 | data = AnnotationData(features) 270 | results = envelope_finder.find(data, mmi, charge) 271 | expected = features 272 | assert len(results) == 1 273 | assert results[0] == expected 274 | -------------------------------------------------------------------------------- /tests/unit/annotation/test_mmi_finder.py: -------------------------------------------------------------------------------- 1 | from tidyms.annotation import mmi_finder 2 | from tidyms.annotation.annotation_data import AnnotationData 3 | from tidyms.chem import PeriodicTable 4 | from tidyms.lcms import LCTrace, Peak 5 | import pytest 6 | import numpy as np 7 | from typing import Sequence 8 | 9 | 10 | def test__select_two_isotope_elements_dm_1_p0_greater_than_pi(): 11 | elements = ["C", "H", "N", "O", "P", "S"] 12 | expected = ["C"] 13 | custom_abundances = dict() 14 | dm = 1 15 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 16 | assert len(res) == len(expected) 17 | assert set(res) == set(expected) 18 | 19 | 20 | def test__select_two_isotope_elements_dm_1_p0_greater_than_pi_custom_abundance(): 21 | elements = ["C", "H", "N", "O", "P", "S"] 22 | expected = ["H"] 23 | custom_abundances = {"H": np.array([0.95, 0.05])} 24 | dm = 1 25 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 26 | assert len(res) == len(expected) 27 | assert set(res) == set(expected) 28 | 29 | 30 | def test__select_two_isotope_elements_dm_1_no_elements(): 31 | elements = ["O", "P", "S"] 32 | custom_abundances = {} 33 | dm = 1 34 | res = mmi_finder._select_two_isotope_element(elements, dm, custom_abundances) 35 | assert len(res) == 0 36 | 37 | 38 | def test__select_two_isotope_elements_dm_1_p0_lower_than_pi(): 39 | elements = ["B", "Li", "O", "P", "S"] 40 | expected = ["B", "Li"] 41 | dm = 1 42 | custom_abundances = dict() 43 | res = mmi_finder._select_two_isotope_element(elements, dm, 
def test__select_two_isotope_elements_dm_1_p0_lower_and_higher_than_pi():
    symbols = ["C", "H", "B", "Li", "O", "P", "S"]
    expected = ["C", "B", "Li"]
    result = mmi_finder._select_two_isotope_element(symbols, 1, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_greater_than_pi():
    symbols = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["Cl"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_greater_than_pi_custom_abundance():
    symbols = ["Cl", "Br", "N", "O", "P", "S"]
    expected = ["Cl"]
    # Br abundance adjusted to force the result to be Cl
    custom_abundances = {"Br": np.array([0.9, 0.1])}
    result = mmi_finder._select_two_isotope_element(symbols, 2, custom_abundances)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_no_elements():
    symbols = ["O", "P", "S"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, {})
    assert len(result) == 0


def test__select_two_isotope_elements_dm_2_p0_lower_than_pi():
    symbols = ["In", "H", "O", "P", "S"]
    expected = ["In"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_two_isotope_elements_dm_2_p0_lower_and_higher_than_pi():
    symbols = ["Cl", "In", "Br", "O", "P", "S"]
    expected = ["Br", "In"]
    result = mmi_finder._select_two_isotope_element(symbols, 2, dict())
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_multiple_isotope_elements():
    symbols = ["Cl", "H", "N", "O", "P", "S"]
    expected = ["O", "S"]
    result = mmi_finder._select_multiple_isotope_elements(symbols)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


def test__select_multiple_isotope_elements_no_elements():
    symbols = ["Cl", "H", "N", "P"]
    expected = []
    result = mmi_finder._select_multiple_isotope_elements(symbols)
    assert len(result) == len(expected)
    assert set(result) == set(expected)


@pytest.mark.parametrize(
    "elements,expected",
    [
        [["C", "H", "N", "O", "P", "S"], ["C", "O", "S"]],
        [["C", "H", "N", "O", "P", "S", "Cl", "Li", "Na"], ["C", "O", "S", "Li", "Cl"]],
    ],
)
def test__select_elements(elements, expected):
    selected = [x.symbol for x in mmi_finder._select_elements(elements)]
    assert len(selected) == len(expected)
    assert set(selected) == set(expected)


@pytest.fixture
def rules():
    """MMI search rules plus the parameters used to create them."""
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    max_mass = 2000.0
    length = 5
    bin_size = 100
    p_tol = 0.05
    rules_dict = mmi_finder._create_rules_dict(
        bounds, max_mass, length, bin_size, p_tol, None
    )
    return rules_dict, max_mass, length, bin_size


def create_peak_list(mz: list[float], sp: list[float]) -> Sequence[Peak]:
    """One Peak per (m/z, intensity) pair, each on its own flat LC trace."""
    peaks = list()
    n = 30
    time = np.linspace(0, n, n)
    scan = np.arange(n)
    ones = np.ones(n)
    for peak_mz, peak_sp in zip(mz, sp):
        trace = LCTrace(time.copy(), ones * peak_sp, ones * peak_mz, scan)
        peaks.append(Peak(10, 15, 20, trace))
    return peaks


def test__find_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # peak list where the monoisotopologue is actually the M1 member of a
    # Cl-containing envelope; the true MMI sits one Cl mass increment below
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    mono_index = 3
    mz = [100.0, 300.0, mono_mz - dm_cl, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1)]


def test__find_candidates_multiple_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # two peaks lie one Cl mass increment below the monoisotopologue, both
    # within the m/z tolerance: both must be reported as candidates
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    charge = 1
    mono_index = 4
    first_mmi_mz = mono_mz - dm_cl
    second_mmi_mz = first_mmi_mz + 0.00001
    mz = [100.0, 300.0, first_mmi_mz, second_mmi_mz, mono_mz, 456.0]
    sp = [100.0, 200.0, 500.0, 500.5, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert candidates == [(peak_list[2], 1), (peak_list[3], 1)]


def test__find_candidates_no_candidates(rules):
    rules_dict, max_mass, length, bin_size = rules
    # no peak below the monoisotopologue matches a Cl mass increment, so the
    # search must return an empty candidate list
    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    mono_mz = 400.0
    charge = 1
    mono_index = 2
    mz = [100.0, 300.0, mono_mz, 456.0]
    sp = [100.0, 200.0, 501.0, 34.0]
    peak_list = create_peak_list(mz, sp)
    monoisotopologue = peak_list[mono_index]

    # pick the rule that applies to the monoisotopologue's mass bin
    i_rules = rules_dict.get(int(mono_mz // bin_size))[0]
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9

    data = AnnotationData(peak_list)
    candidates = mmi_finder._find_candidate(
        data, monoisotopologue, charge, i_rules, mz_tol, p_tol, max_mass, min_similarity
    )
    assert len(candidates) == 0


def test_MMIFinder():
    bounds = {"C": (0, 108), "H": (0, 100), "S": (0, 8), "Cl": (0, 2)}
    max_mass = 2000.0
    length = 5
    bin_size = 100
    max_charge = 3
    mz_tol = 0.005
    p_tol = 0.05
    min_similarity = 0.9
    finder = mmi_finder.MMIFinder(
        bounds, max_mass, max_charge, length, bin_size, mz_tol, p_tol, min_similarity
    )

    _, M_cl, _ = PeriodicTable().get_element("Cl").get_abundances()
    dm_cl = M_cl[1] - M_cl[0]
    mono_mz = 400.0
    peak_list = create_peak_list(
        [100.0, 300.0, mono_mz - dm_cl, mono_mz, 456.0],
        [100.0, 200.0, 500.0, 501.0, 34.0],
    )
    data = AnnotationData(peak_list)
    monoisotopologue = data.get_monoisotopologue()
    found = finder.find(data)
    expected = [
        (monoisotopologue, 1),
        (monoisotopologue, 2),
        (monoisotopologue, 3),
        (peak_list[2], 1),
    ]
    # compare as sets because candidate order is not guaranteed
    assert set(found) == set(expected)
import numpy as np
from tidyms import _batch_corrector
# import pytest
from statsmodels.nonparametric.smoothers_lowess import lowess


def test_correct_batches(data_container_with_order):
    # smoke test: correction with default parameters must run without raising
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        verbose=False
    )
    assert True


def test_correct_batches_frac(data_container_with_order):
    # test specifying a frac value
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        frac=0.7,
        verbose=False
    )
    assert True


def test_correct_batches_first_n(data_container_with_order):
    # test specifying a first_n value (comment fixed: it previously said
    # "frac", copy/pasted from the test above)
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    _batch_corrector.correct_batches(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        first_n=1,
        verbose=False
    )
    assert True


def test_lowess_min_n_samples():
    # lowess must return the input unchanged when fitted on fewer than four
    # points. A seeded generator makes the test deterministic.
    n = 4
    rng = np.random.default_rng(42)
    for k in range(2, n):
        x = np.arange(k)
        y = rng.normal(size=k)
        y_fit = lowess(y, x, is_sorted=True, return_sorted=False)
        assert np.allclose(y, y_fit)


def test_split_data_matrix(data_container_with_order):
    # Test if we can rebuild the matrix from the fragments
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    iterator = _batch_corrector._split_data_matrix(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        0.0
    )
    rebuilt = np.zeros(shape=data_matrix.shape, dtype=float)
    for start, k, order, xgk, _, _ in iterator:
        rebuilt[start + np.arange(xgk.size), k] = xgk.flatten()
    assert np.array_equal(data_matrix.to_numpy(), rebuilt)


def test_rebuild_data_matrix(data_container_with_order):
    # Test if we can rebuild the matrix from the fragments
    data_matrix = data_container_with_order.data_matrix
    sample_metadata = data_container_with_order.sample_metadata
    sample_class = ["healthy", "disease"]
    qc_class = ["QC"]
    iterator = _batch_corrector._split_data_matrix(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        0.0
    )

    # compute index used to rebuild the matrix but don't modify the values
    def process_chunk(args):
        start_index, column, order, x, train_index, predict_index = args
        index = np.arange(x.size) + start_index
        return x, index, column

    chunks = [process_chunk(x) for x in iterator]
    shape = data_matrix.shape
    rebuilt = _batch_corrector._rebuild_data_matrix(shape, chunks)
    X = data_matrix.to_numpy()
    assert np.array_equal(X, rebuilt)


def test_find_invalid_samples(data_container_with_order):
    # a well-formed run order contains no invalid samples
    data = data_container_with_order
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 0


def test_find_invalid_samples_remove_first_block(data_container_with_order):
    # check if study samples with order lower than qc samples are removed
    data = data_container_with_order
    sample_metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # modify one value at the beginning
    sample_metadata.at[sample_metadata.index[0], "class"] = sample_class[0]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 1


def test_find_invalid_samples_remove_last_block(data_container_with_order):
    # check if study samples with order higher than qc samples are removed
    # (comment fixed: it previously said "lower", copy/pasted from the test
    # above)
    data = data_container_with_order
    sample_metadata = data.sample_metadata.copy()
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # modify one value at the end (comment fixed: previously "beginning")
    sample_metadata.at[sample_metadata.index[-1], "class"] = sample_class[0]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == 1


def test_find_invalid_samples_invalid_batch(
        data_container_with_order_single_qc):
    # batches without the minimum number of QC samples are removed whole
    # (comment fixed: it previously described the first/last-block tests)
    data = data_container_with_order_single_qc
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    # the third batch has only two QC samples and must be removed.
    n_invalid = sample_metadata["batch"].value_counts()[3]
    invalid_samples = _batch_corrector.find_invalid_samples(
        sample_metadata,
        sample_class,
        qc_class
    )
    assert invalid_samples.size == n_invalid


def test_find_invalid_features(data_container_with_order):
    # with a zero threshold no feature should be flagged
    data = data_container_with_order
    data_matrix = data.data_matrix
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    threshold = 0.0
    min_detection_rate = 1.0
    invalid_features = _batch_corrector.find_invalid_features(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        threshold,
        min_detection_rate
    )
    assert invalid_features.size == 0


def test_find_invalid_features_threshold(data_container_with_order):
    # using high threshold, all features should be removed
    data = data_container_with_order
    data_matrix = data.data_matrix
    sample_metadata = data.sample_metadata
    sample_class = data.mapping["sample"]
    qc_class = data.mapping["qc"]
    threshold = 10000000.0
    min_detection_rate = 1.0
    invalid_features = _batch_corrector.find_invalid_features(
        data_matrix,
        sample_metadata,
        sample_class,
        qc_class,
        threshold,
        min_detection_rate
    )
    assert invalid_features.size == data_matrix.shape[1]


from tidyms.chem import atoms
import pytest


def test_PeriodicTable_get_element_from_symbol():
    # element lookup by chemical symbol
    ptable = atoms.PeriodicTable()
    c = ptable.get_element("C")
    assert c.z == 6
    assert c.symbol == "C"
def test_PeriodicTable_get_isotope_from_symbol():
    # isotopes are addressed by "<mass number><symbol>" strings
    table = atoms.PeriodicTable()
    cl37 = table.get_isotope("37Cl")
    assert cl37.get_symbol() == "Cl"
    assert cl37.a == 37


def test_PeriodicTable_get_isotope_copy():
    # copy=True must yield an equivalent but distinct Isotope instance
    table = atoms.PeriodicTable()
    isotope_str = "37Cl"
    shared = table.get_isotope(isotope_str)
    duplicate = table.get_isotope(isotope_str, copy=True)
    assert shared is not duplicate
    assert shared.a == duplicate.a
    assert shared.m == duplicate.m
    assert shared.z == duplicate.z


@pytest.mark.parametrize(
    "z,a,m,abundance,expected_symbol",
    [
        [6, 12, 12.0, 0.9, "C"],  # Carbon. Dummy abundances and exact mass are used.
        [1, 1, 1.0078, 0.9, "H"],  # Hydrogen
        [15, 31, 30.099, 1.0, "P"]  # Phosphorus
    ]
)
def test_Isotope_get_symbol(z, a, m, abundance, expected_symbol):
    assert atoms.Isotope(z, a, m, abundance).get_symbol() == expected_symbol


def test_Element_get_monoisotope():
    # 11B is the most abundant boron isotope
    assert atoms.PeriodicTable().get_element("B").get_monoisotope().a == 11


def test_Element_get_mmi():
    # 10B is the minimum-mass boron isotope
    assert atoms.PeriodicTable().get_element("B").get_mmi().a == 10


from tidyms.chem import formula
from tidyms.chem.atoms import InvalidIsotope, PeriodicTable
import pytest


@pytest.mark.parametrize(
    "formula_str,p_open,p_close",
    [
        ("[Cr[H2O]6]3+", 0, 9),
        ("[C9H11NO2]", 0, 9),
        ("C9H11N(17O)2", 6, 10),
        ("[Cr[(2H)2O]6]3+", 3, 10),
    ],
)
def test_find_matching_parenthesis_valid_input(formula_str, p_open, p_close):
    # index of the bracket that closes the one opened at p_open
    assert formula._find_matching_parenthesis(formula_str, p_open) == p_close


@pytest.mark.parametrize(
    "formula_str,formula_without_charge,charge",
    [
        ("H2O", "H2O", 0),
        ("(13C)", "(13C)", 0),
        ("[CO3]2-", "CO3", -2),
        ("[Cr[H2O]6]3+", "Cr[H2O]6", 3),
        ("[C9H11NO2]", "[C9H11NO2]", 0),
        ("CO-", "CO", -1),
        ("[H2O]+", "[H2O]", 1),
        ("H2O+", "H2O", 1),
    ],
)
def test_parse_charge_valid_input(formula_str, formula_without_charge, charge):
    # the trailing charge is split from the rest of the formula
    stripped, q = formula._parse_charge(formula_str)
    assert q == charge
    assert stripped == formula_without_charge


@pytest.mark.parametrize("formula_str", ["SO42-"])
def test_parse_charge_invalid_input(formula_str):
    # multi-digit charges without an enclosing bracket are rejected
    with pytest.raises(formula.InvalidFormula):
        formula._parse_charge(formula_str)


@pytest.mark.parametrize(
    "formula_str,ind,token_type",
    [
        ("H2O", 0, 0),
        ("H2(34S)O4", 2, 1),
        ("[Cr(H2O)6]3+", 3, 2),
        ("[Fe[CN]6]4-", 3, 2),
    ],
)
def test_get_token_type(formula_str, ind, token_type):
    # inferred from the cases: 0 = element, 1 = isotope group,
    # 2 = bracketed sub-expression
    assert formula._get_token_type(formula_str, ind) == token_type


@pytest.mark.parametrize(
    "formula_str,ind,coeff,new_ind",
    [
        ("H2O", 3, 1, 3),
        ("CO2", 1, 1, 1),
        ("C9H11NO2", 3, 11, 5),
    ]
)
def test_get_coefficient_valid_input(formula_str, ind, coeff, new_ind):
    # an absent coefficient defaults to 1 and leaves the index unchanged
    parsed_coeff, next_ind = formula._get_coefficient(formula_str, ind)
    assert parsed_coeff == coeff
    assert next_ind == new_ind


@pytest.mark.parametrize(
    "formula_str,ind,new_ind,element",
    [
        ("H2O", 0, 2, "H"),
        ("H2O", 2, 3, "O"),
        ("C9H11NO2", 5, 6, "N"),
        ("C9H11N(17O)2", 5, 6, "N"),
        ("Cr(H2O)6", 0, 2, "Cr"),
    ]
)
def test_tokenize_element_valid_input(formula_str, ind, new_ind, element):
    token, next_ind = formula._tokenize_element(formula_str, ind)
    assert next_ind == new_ind
    # an element token contains the element's most abundant isotope
    expected_isotope = PeriodicTable().get_element(element).get_monoisotope()
    assert expected_isotope in token


@pytest.mark.parametrize(
    "formula_str,ind,isotope_str,new_ind",
    [
        ("(13C)O2", 0, "13C", 5),
        ("C9H11(15N)2O2", 5, "15N", 11),
        ("C6H12O5(18O)", 7, "18O", 12),
        ("C6H12O4(18O)2", 7, "18O", 13),
    ]
)
def test_tokenize_isotope_valid_input(formula_str, ind, isotope_str, new_ind):
    token, next_ind = formula._tokenize_isotope(formula_str, ind)
    assert next_ind == new_ind
    assert PeriodicTable().get_isotope(isotope_str) in token


@pytest.mark.parametrize(
    "f_str,composition",
    [
        ("H2O", {"1H": 2, "16O": 1}),
        ("(13C)O2", {"13C": 1, "16O": 2}),
        ("C9H11(15N)2O2", {"12C": 9, "1H": 11, "15N": 2, "16O": 2}),
        ("C9H11N2O2", {"12C": 9, "1H": 11, "14N": 2, "16O": 2}),
        ("Cr[(2H)2O]6", {"52Cr": 1, "2H": 12, "16O": 6})
    ]
)
def test_tokenize_formula(f_str, composition):
    # parsing yields per-isotope coefficients, explicit isotopes preserved
    table = PeriodicTable()
    expected = {table.get_isotope(k): v for k, v in composition.items()}
    parsed = formula._parse_formula(f_str)
    for isotope, coefficient in expected.items():
        assert parsed[isotope] == coefficient


def test_arg_sort_elements():
    # sorted by symbol first, then by mass number
    symbols = ["Cd", "C", "H", "H", "O", "O", "S", "B"]
    a = [60, 12, 2, 1, 16, 17, 32, 7]
    expected_order = [7, 1, 0, 3, 2, 4, 5, 6]
    assert formula._arg_sort_elements(symbols, a) == expected_order


@pytest.mark.parametrize(
    "charge,charge_str",
    [
        (1, "+"),
        (2, "2+"),
        (-1, "-"),
        (-4, "4-")
    ]
)
def test_get_charge_str(charge, charge_str):
    # unit charges render as a bare sign
    assert formula._get_charge_str(charge) == charge_str


@pytest.mark.parametrize(
    "f,f_str",
    [
        (formula.Formula("CO2"), "CO2"),
        (formula.Formula("(13C)C2H6O3"), "C2(13C)H6O3"),
        (formula.Formula("C24H46SPN(18O)2"), "C24H46N(18O)2PS"),
        (formula.Formula("[Cr(H2O)6]3+"), "[H12CrO6]3+"),
        (formula.Formula("CH3CH2CH3"), "C3H8"),
        (formula.Formula("F2"), "F2"),
    ]
)
def test_get_formula_str(f, f_str):
    # formulas are rendered in a canonical (Hill-like) element order
    assert str(f) == f_str


@pytest.mark.parametrize("f_str", ["(CO2", "#H2O"])
def test_parse_formula_invalid_formula(f_str):
    # unbalanced brackets or illegal characters are rejected
    with pytest.raises(formula.InvalidFormula):
        formula.Formula(f_str)


@pytest.mark.parametrize("f_str", ["(14C)O2", "(3H)2O"])
def test_parse_formula_invalid_isotope(f_str):
    # isotopes missing from the periodic table data are rejected
    with pytest.raises(InvalidIsotope):
        formula.Formula(f_str)


@pytest.fixture
def formula_data():
    # formula strings paired with their nominal and exact masses
    formula_str = ["CO2", "H2O", "F2"]
    nominal = [44, 18, 38]
    exact = [43.9898, 18.0106, 37.9968]
    return formula_str, nominal, exact


def test_get_exact_mass(formula_data):
    formula_str, _, exact = formula_data
    for f_str, expected in zip(formula_str, exact):
        assert abs(formula.Formula(f_str).get_exact_mass() - expected) < 0.0001


def test_get_nominal_mass(formula_data):
    formula_str, nominal, _ = formula_data
    for f_str, expected in zip(formula_str, nominal):
        assert formula.Formula(f_str).get_nominal_mass() == expected


def test_formula_from_dictionary():
    # plain element symbols map to their most abundant isotope
    composition = {"C": 1, "17O": 2, "H": 2}
    charge = 1
    f = formula.Formula(composition, charge)
    assert f.charge == charge
    for key in composition:
        assert PeriodicTable().get_isotope(key) in f.composition


def test_formula_from_dictionary_invalid_isotope():
    # "G" is not a valid element or isotope string
    composition = {"C": 1, "G": 4}
    with pytest.raises(InvalidIsotope):
        formula.Formula(composition, 1)


def test_formula_from_dictionary_invalid_isotope_type():
    # non-string keys are rejected
    composition = {4: 1, "G": 4}
    with pytest.raises(ValueError):
        formula.Formula(composition, 1)


@pytest.mark.parametrize(
    "composition,q",
    [
        [{"C": -1, "H": 4}, 1],
        [{"C": 1, "H": 4}, 0.5],
    ]
)
def test_formula_from_dictionary_invalid_coefficient(composition, q):
    # negative coefficients and non-integer charges are rejected
    with pytest.raises(ValueError):
        formula.Formula(composition, q)


def test_Formula_add():
    result = formula.Formula("H2O") + formula.Formula("CO2")
    assert result == formula.Formula("H2CO3")


def test_Formula_add_invalid_type():
    # only Formula instances may be added
    with pytest.raises(ValueError):
        formula.Formula("H2O") + "CO2"


def test_Formula_subtract_valid():
    result = formula.Formula("C6H12O6") - formula.Formula("CO2")
    assert result == formula.Formula("C5H12O4")


def test_Formula_subtract_invalid_type():
    with pytest.raises(ValueError):
        formula.Formula("C6H12O6") - "CO2"


def test_Formula_subtract_valid_zero_coeff():
    # elements whose coefficient drops to zero disappear from the result
    result = formula.Formula("C4H8O2") - formula.Formula("CO2")
    assert result == formula.Formula("C3H8")


def test_Formula_subtract_invalid_coeff():
    # a subtraction producing a negative coefficient is an error
    with pytest.raises(ValueError):
        formula.Formula("C4H8O") - formula.Formula("CO2")
import pytest
import numpy as np
from tidyms.chem import _envelope_utils as ids
from tidyms.chem import Formula, PeriodicTable
from itertools import product


@pytest.mark.parametrize(
    "isotope_symbol,n,max_length",
    product(["2H", "31P"], [0, 1, 5], [1, 2, 5]))
def test__get_n_isotopes_envelope(isotope_symbol: str, n: int, max_length: int):
    # n copies of one isotope: all mass in the first slot with abundance 1
    isotope = PeriodicTable().get_isotope(isotope_symbol)
    M, p = ids._get_n_isotopes_envelope(isotope, n, max_length)
    M_expected = np.zeros(max_length)
    M_expected[0] = n * isotope.m
    p_expected = np.zeros(max_length)
    p_expected[0] = 1.0
    assert np.array_equal(M, M_expected)
    assert np.array_equal(p, p_expected)


def test__validate_abundance_valid_value():
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.2])
    ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_negative_values():
    # abundances must be non-negative
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, -0.01])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_non_normalized():
    # abundances must sum to one
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.21])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


def test__validate_abundance_invalid_length():
    # the abundance array length must match the element's isotope count
    symbol = "C"
    c = PeriodicTable().get_element(symbol)
    mc, _, _ = c.get_abundances()
    p = np.array([0.8, 0.015, 0.05])
    with pytest.raises(ValueError):
        ids._validate_abundance(p, mc, symbol)


@pytest.mark.parametrize(
    "n_isotopes,n",
    [[1, 1], [1, 2], [1, 5], [1, 10], [2, 1], [2, 5], [2, 20], [5, 1], [5, 10]]
)
def test__find_n_isotopes_combination(n_isotopes, n):
    comb = ids._find_n_isotope_combination(n_isotopes, n)
    expected = [x for x in product(range(n + 1), repeat=n_isotopes) if sum(x) == n]
    expected = np.array(expected)
    # check that the row content is equal
    for x in expected:
        assert x in comb
    for x in comb:
        assert x in expected


@pytest.mark.parametrize(
    "element,max_length",
    product(["C", "S"], [2, 5, 10]))
def test__get_n_atoms_envelope_aux_n_1(element: str, max_length: int):
    # with n=1 the envelope is the normalized, length-padded elemental one
    element = PeriodicTable().get_element(element)
    me, Me, pe = element.get_abundances()
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, 1, max_length)
    Me, pe = ids._fill_missing_nominal(me, Me, pe, max_length)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe / np.sum(pe))


def test__get_n_atoms_envelope_aux_c_n_3_max_length_3():
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # use m_c12 consistently (the original mixed the literal 12 in)
    M_expected = np.array([3 * m_c12, 2 * m_c12 + m_c13, m_c12 + 2 * m_c13])
    assert np.allclose(M, M_expected)
    # BUG FIX: the original asserted np.sum(pe) == 1, but ``pe`` is the input
    # elemental abundance, which sums to 1 by construction (vacuous). The
    # intent is to check the computed envelope ``p`` is normalized; assumes
    # the helper renormalizes truncated envelopes, consistent with the n=1
    # test above -- verify against _get_n_atoms_envelope_aux.
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_c_n_3_max_length_5():
    element = PeriodicTable().get_element("C")
    m_c12 = 12
    m_c13 = element.isotopes[13].m
    me, Me, pe = element.get_abundances()
    n = 3
    max_length = 5
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    M_expected = np.array(
        [3 * m_c12, 2 * m_c12 + m_c13, m_c12 + 2 * m_c13, 3 * m_c13, 0]
    )
    assert np.allclose(M, M_expected)
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_s_n_2_max_length_3():
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 3
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    assert np.array_equal(M.round().astype(int), np.array([64, 65, 66]))
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the carbon max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope_aux_s_n_2_max_length_10():
    element = PeriodicTable().get_element("S")
    me, Me, pe = element.get_abundances()
    n = 2
    max_length = 10
    M, p = ids._get_n_atoms_envelope_aux(me, Me, pe, n, max_length)
    # 71 and 73 cannot be formed from two S isotopes, hence the zeros
    M_rounded = np.array([64, 65, 66, 67, 68, 69, 70, 0, 72, 0])
    assert np.array_equal(M.round().astype(int), M_rounded)
    # BUG FIX: assert on the output ``p``, not the input ``pe`` (see note in
    # the carbon max_length=3 test above)
    assert np.allclose(np.sum(p), 1.0)


def test__get_n_atoms_envelope():
    element = PeriodicTable().get_element("C")
    c12 = element.isotopes[12]
    me, Me, pe = element.get_abundances()
    M, p = ids._get_n_atoms_envelope(c12, 1, 2)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe)


def test__get_n_atoms_envelope_custom_abundance():
    # a user-supplied abundance overrides the natural one
    element = PeriodicTable().get_element("C")
    c12 = element.isotopes[12]
    me, Me, pe = element.get_abundances()
    pe = np.array([0.8, 0.2])
    M, p = ids._get_n_atoms_envelope(c12, 1, 2, p=pe)
    assert np.allclose(M, Me)
    assert np.allclose(p, pe)


def test__fill_missing_nominal_no_fill():
    # carbon does not need missing nominal values filled (typo fixed:
    # the comment previously said "feel")
    max_length = 5
    m = np.array([24, 25, 26, 0, 0])
    M = np.array([24.1, 24.2, 24.3, 0, 0])
    p = np.array([0.5, 0.3, 0.2, 0, 0])
    M_fill, p_fill = ids._fill_missing_nominal(m, M, p, max_length)
    assert np.allclose(M_fill, M)
    assert np.allclose(p_fill, p)


def test__fill_missing_nominal_fill():
    # Cl does not have an M + 1 isotope and must be filled
    # (dummy mass values are used here)
    max_length = 5
    m = np.array([105, 107, 109])
    M = np.array([105.1, 107.2, 109.3])
    p = np.array([0.5, 0.3, 0.2])
    M_fill, p_fill = ids._fill_missing_nominal(m, M, p, max_length)
    M_expected = np.array([M[0], 0, M[1], 0, M[2]])
    p_expected = np.array([p[0], 0, p[1], 0, p[2]])
    assert np.allclose(M_fill, M_expected)
    assert np.allclose(p_fill, p_expected)


def test__combine_envelopes_one_row_array():
    c12 = PeriodicTable().get_isotope("12C")
    max_length = 10
    n1 = 2
    n2 = 5
    n = n1 + n2
    M1, p1 = ids._get_n_atoms_envelope(c12, n1, max_length)
    M1 = M1.reshape((1, M1.size))
    p1 = p1.reshape((1, p1.size))
    M2, p2 = ids._get_n_atoms_envelope(c12, n2, max_length)
    # consistency fix: reshape with the arrays' own sizes (the original used
    # M1.size / p1.size; the values are equal because max_length is shared)
    M2 = M2.reshape((1, M2.size))
    p2 = p2.reshape((1, p2.size))
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = ids._get_n_atoms_envelope(c12, n, max_length)
    M_expected = M_expected.reshape((1, M_expected.size))
    p_expected = p_expected.reshape((1, p_expected.size))
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)


def test__combine_envelopes_multiple_row_array():
    # combining row-replicated envelopes equals replicating the combination
    c12 = PeriodicTable().get_isotope("12C")
    n_rep = 5
    max_length = 10
    n1 = 2
    n2 = 5
    n = n1 + n2
    M1, p1 = ids._get_n_atoms_envelope(c12, n1, max_length)
    M1 = np.tile(M1, (n_rep, 1))
    p1 = np.tile(p1, (n_rep, 1))
    M2, p2 = ids._get_n_atoms_envelope(c12, n2, max_length)
    M2 = np.tile(M2, (n_rep, 1))
    p2 = np.tile(p2, (n_rep, 1))
    M, p = ids.combine_envelopes(M1, p1, M2, p2)
    M_expected, p_expected = ids._get_n_atoms_envelope(c12, n, max_length)
    M_expected = np.tile(M_expected, (n_rep, 1))
    p_expected = np.tile(p_expected, (n_rep, 1))
    assert np.allclose(M, M_expected)
    assert np.allclose(p, p_expected)
import pytest
import numpy as np
from tidyms.chem import EnvelopeScorer, EnvelopeValidator
from tidyms.chem import Formula, get_chnops_bounds


formula_str_list = ["C11H12N2O2", "C6H12O6", "C27H46O", "CO2", "HCOOH"]


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_find_valid_bounds(f_str):
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(max_length)
    tolerance = 0.005
    validator.generate_envelopes(M, p, tolerance)
    # results are not strictly equal due to being computed using a subset
    # of elements in the validator; a tolerance is used to check validity
    # in M and p
    p_tol = 0.0001
    M_tol = 0.0000001
    for k in range(M.size):
        min_M, max_M, min_p, max_p = validator._find_bounds(k)
        assert min_M - M_tol < M[k] < max_M + M_tol
        assert min_p - p_tol < p[k] < max_p + p_tol


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeValidator_validate(f_str):
    # envelopes from valid CHNOPS formulas must validate completely
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(max_length)
    validated_length = validator.validate(M, p)
    assert validated_length == max_length


def test_EnvelopeValidator_validate_invalid_envelope():
    # boron is outside the CHNOPS bounds, so no peak should validate
    max_length = 5
    bounds = get_chnops_bounds(500)
    validator = EnvelopeValidator(bounds, max_length=max_length)
    f = Formula("C2H8B")
    M, p = f.get_isotopic_envelope(max_length)
    validated_length = validator.validate(M, p)
    expected_length = 0
    assert validated_length == expected_length


def _check_top_candidate(scorer, f_str, tolerance):
    # Helper: score the exact envelope of ``f_str`` and check that the
    # top-ranked candidate recovers its molecular formula. Extracted from
    # five tests that repeated the same pattern.
    f = Formula(f_str)
    M, p = f.get_isotopic_envelope(scorer.max_length)
    scorer.score(M, p, tolerance)
    coeff, isotopes, score = scorer.get_top_results(5)
    expected_coeff = [f.composition[x] for x in isotopes]
    assert np.array_equal(expected_coeff, coeff[0])


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer(f_str):
    # test that the best scoring candidate has the same molecular formula
    bounds = get_chnops_bounds(500)
    chnops_scorer = EnvelopeScorer(bounds, max_length=5)
    _check_top_candidate(chnops_scorer, f_str, 0.005)


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_length_gt_scorer_max_length(f_str):
    # an envelope longer than the scorer's max_length must be rejected
    f = Formula(f_str)
    max_length = 3
    bounds = get_chnops_bounds(500)
    chnops_scorer = EnvelopeScorer(bounds, max_length=max_length)
    M, p = f.get_isotopic_envelope(max_length + 1)
    tolerance = 0.005

    with pytest.raises(ValueError):
        chnops_scorer.score(M, p, tolerance)
    # NOTE(review): the original test also fetched and checked top results
    # after the raising call inside this block; since ``score`` raises, that
    # code could never execute and was removed as dead code.


@pytest.mark.parametrize("f_str", formula_str_list)
def test_EnvelopeScorer_custom_scorer(f_str):

    def cosine_scorer(mz1, ab1, mz2, ab2, **scorer_params):
        # NOTE(review): despite its name, this computes a distance-derived
        # score, not a true cosine similarity. Kept as-is because the test
        # only verifies that a user-supplied scorer is plugged in.
        n1 = np.linalg.norm(ab1)
        n2 = np.linalg.norm(ab2)
        norm = np.linalg.norm(ab1 - ab2)
        cosine = norm / (n1 * n2)
        return 1 - cosine

    bounds = get_chnops_bounds(500)
    envelope_scorer = EnvelopeScorer(bounds, scorer=cosine_scorer, max_length=5)
    _check_top_candidate(envelope_scorer, f_str, 0.005)


@pytest.fixture
def positive_elements_scorer():
    # scorer restricted to elements with positive mass-defect isotopes
    bounds = {"C": (0, 10), "H": (0, 10), "N": (0, 10)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["C2H3N", "N2H4", "C3N3H3"])
def test_EnvelopeScorer_positive_defect_elements_only(f_str, positive_elements_scorer):
    _check_top_candidate(positive_elements_scorer, f_str, 0.005)


@pytest.fixture
def negative_elements_scorer():
    # scorer restricted to elements with negative mass-defect isotopes
    bounds = {"C": (0, 10), "O": (0, 10), "S": (0, 10)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["CS2", "C2OS2", "C3SO"])
def test_EnvelopeScorer_negative_defect_elements_only(f_str, negative_elements_scorer):
    # note: this case uses a tighter tolerance (0.001) than the others
    _check_top_candidate(negative_elements_scorer, f_str, 0.001)


@pytest.fixture
def no_carbon_scorer():
    bounds = {"H": (0, 10), "O": (0, 5), "S": (0, 5), "P": (0, 5)}
    return EnvelopeScorer(bounds, max_length=5)


@pytest.mark.parametrize("f_str", ["H2O", "H3PO4", "H2SO4"])
def test_EnvelopeScorer_no_carbon(f_str, no_carbon_scorer):
    _check_top_candidate(no_carbon_scorer, f_str, 0.005)


from tidyms import consensus_annotation
from tidyms import _constants as c
import pandas as pd
import pytest
from collections import Counter


@pytest.fixture
def feature_table():
    # Three feature labels, all belonging to the same envelope.
    # Rows with -1 are noise.
    columns = [c.SAMPLE, c.LABEL, c.ENVELOPE_LABEL, c.ENVELOPE_INDEX, c.CHARGE]
    data = [
        [0, -1, -1, -1, -1],
        [1, -1, -1, -1, -1],
        [2, -1, -1, -1, -1],
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [2, 0, 0, 0, 1],
        [3, 0, 0, 1, 1],
        [4, 0, 0, 0, 2],
        [0, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [2, 1, 0, 1, 1],
        [3, 1, 0, 2, 1],
        [4, 1, 0, 1, 2],
        [0, 2, 0, 2, 1],
        [1, 2, 0, 2, 1],
        [2, 2, 0, 2, 1],
        [3, 2, 0, 2, 1],
        [4, 2, 0, 2, 2],
    ]
    return pd.DataFrame(data=data, columns=columns)


def test__build_graph(feature_table):
    # NOTE(review): the name suggests _build_graph, but the public entry
    # point vote_annotations is exercised; consider renaming in a follow-up.
    graph, annotations = consensus_annotation.vote_annotations(feature_table)
    assert len(annotations) == 3
    # the majority vote must recover the per-label consensus values
    for ft_label, ft_data in annotations.items():
        assert ft_data[c.CHARGE] == 1
        assert ft_data[c.ENVELOPE_LABEL] == 0
        assert ft_data[c.ENVELOPE_INDEX] == ft_label


def test__build_graph_nodes(feature_table):
    nodes = consensus_annotation._build_graph_nodes(feature_table)
    expected = {
        0: {c.CHARGE: 1, c.ENVELOPE_INDEX: 0},
        1: {c.CHARGE: 1, c.ENVELOPE_INDEX: 1},
        2: {c.CHARGE: 1, c.ENVELOPE_INDEX: 2}
    }
    assert nodes == expected


def test__build_graph_edges(feature_table):
    edges = consensus_annotation._build_graph_edges(feature_table)
    edge_count = Counter(edges)
    expected = Counter({(0, 1): 4, (0, 2): 4})
    assert edge_count == expected
10, 20000]] 14 | ) 15 | def test_make_initial_cluster(n, k, max_size): 16 | # n is the number of samples 17 | # k is the number of clusters 18 | # test with several sample sizes and check that the result is the same 19 | # as using DBSCAN without data split 20 | X1 = np.arange(n) 21 | X2 = np.arange(n) 22 | X = np.vstack((X1, X2)).T 23 | X = np.repeat(X, k, axis=0) 24 | X = np.random.permutation(X) 25 | # k cluster, no noise should be present 26 | eps = 0.1 27 | min_samples = round(n * 0.2) 28 | test_cluster = correspondence._cluster_dbscan( 29 | X, eps, min_samples, max_size 30 | ) 31 | dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="chebyshev") 32 | dbscan.fit(X) 33 | expected_cluster = dbscan.labels_ 34 | assert np.array_equal(test_cluster, expected_cluster) 35 | 36 | 37 | # test estimate n species 38 | 39 | @pytest.mark.parametrize( 40 | "min_samples,expected", 41 | [[1, np.array([2, 2])], [2, np.array([2, 2])], [3, np.array([0, 0])]]) 42 | def test_estimate_n_species_one_class(min_samples, expected): 43 | samples = np.array( 44 | [0] * 4 + [1] * 4 # 8 features detected in total in two samples 45 | ) 46 | clusters = np.array( 47 | ([0] * 2 + [1] * 2) * 2 # two clusters 48 | ) 49 | n_clusters = 2 50 | # two species in two clusters are expected 51 | res = correspondence._estimate_n_species_one_class( 52 | samples, clusters, min_samples, n_clusters 53 | ) 54 | assert np.array_equal(res, expected) 55 | 56 | 57 | def test_estimate_n_species_multiple_groups(): 58 | samples = np.array( 59 | [0] * 4 + [1] * 4 + [2] * 4 # 12 features in three samples 60 | ) 61 | clusters = np.array( 62 | ([0] * 2 + [1] * 2) * 3 # two clusters 63 | ) 64 | classes = np.array( 65 | [0] * 8 + [1] * 4 # two groups 66 | ) 67 | min_dr = 0.5 68 | # two species in two clusters are expected 69 | expected = {0: 2, 1: 2} 70 | include_classes = [0, 1] 71 | samples_per_class = {0: 2, 1: 1} 72 | 73 | res = correspondence._estimate_n_species( 74 | samples, clusters, classes, 
samples_per_class, include_classes, min_dr) 75 | assert res == expected 76 | 77 | 78 | # test _get_min_samples 79 | 80 | @pytest.fixture 81 | def samples_per_class(): 82 | res = { 83 | 0: 8, 84 | 1: 16, 85 | 2: 24 86 | } 87 | return res 88 | 89 | 90 | def test_get_min_samples_include_classes_none(samples_per_class): 91 | min_fraction = 0.25 92 | include_classes = None 93 | test_min_samples = correspondence._get_min_sample( 94 | samples_per_class, include_classes, min_fraction) 95 | expected_min_samples = round(sum(samples_per_class.values()) * min_fraction) 96 | assert expected_min_samples == test_min_samples 97 | 98 | 99 | def test_get_min_samples_include_classes(samples_per_class): 100 | min_fraction = 0.25 101 | include_classes = [0, 1] 102 | test_min_samples = correspondence._get_min_sample( 103 | samples_per_class, include_classes, min_fraction) 104 | n_include = [v for k, v in samples_per_class.items()if k in include_classes] 105 | expected_min_samples = round(min(n_include) * min_fraction) 106 | assert expected_min_samples == test_min_samples 107 | 108 | 109 | def test_process_cluster_one_species(): 110 | np.random.seed(1234) 111 | # features 112 | n = 200 113 | X = np.random.normal(size=(n, 2)) 114 | samples = np.arange(n) 115 | 116 | # add noise 117 | n_noise = 10 118 | noise = np.random.normal(size=(n_noise, 2), loc=4) 119 | X = np.vstack((X, noise)) 120 | s_noise = np.random.choice(samples, size=n_noise) 121 | samples = np.hstack((samples, s_noise)) 122 | 123 | expected = np.array([0] * n + [-1] * n_noise) 124 | 125 | n_species = 1 126 | max_deviation = 4 127 | labels, score = correspondence._process_cluster( 128 | X, samples, n_species, max_deviation) 129 | assert np.array_equal(labels, expected) 130 | 131 | 132 | def test_process_cluster_two_species(): 133 | np.random.seed(1234) 134 | # features 135 | n = 200 136 | x_list = list() 137 | s_list = list() 138 | for loc in [0, 4]: 139 | x_list.append(np.random.normal(size=(n, 2), loc=loc)) 140 | 
s_list.append(np.arange(n)) 141 | 142 | # add noise 143 | n_noise = 10 144 | x_list.append(np.random.normal(size=(n_noise, 2), loc=8)) 145 | X = np.vstack(x_list) 146 | s_list.append(np.random.choice(s_list[0], size=n_noise)) 147 | samples = np.hstack(s_list) 148 | 149 | expected = np.array([0] * n + [1] * n + [-1] * n_noise) 150 | 151 | n_species = 2 152 | max_deviation = 4 153 | labels, score = correspondence._process_cluster( 154 | X, samples, n_species, max_deviation) 155 | assert np.array_equal(labels, expected) 156 | 157 | 158 | def test_match_features(): 159 | np.random.seed(1234) 160 | # features 161 | n = 200 162 | x_list = list() 163 | s_list = list() 164 | for loc in [0, 4]: 165 | x_list.append(np.random.normal(size=(n, 2), loc=loc)) 166 | s_list.append(np.arange(n)) 167 | 168 | # add noise 169 | n_noise = 10 170 | x_list.append(np.random.normal(size=(n_noise, 2), loc=8)) 171 | X = np.vstack(x_list) 172 | s_list.append(np.random.choice(s_list[0], size=n_noise)) 173 | samples = np.hstack(s_list) 174 | 175 | feature_table = pd.DataFrame(X, columns=["mz", "rt"]) 176 | feature_table[c.SAMPLE] = samples 177 | feature_table[c.CLASS] = 0 178 | samples_per_class = {0: 200} 179 | 180 | expected = np.array([0] * n + [1] * n + [-1] * n_noise) 181 | 182 | labels = correspondence.match_features( 183 | feature_table, samples_per_class, None, 2, 2, 0.25, 4, verbose=True) 184 | labels = labels[c.LABEL] 185 | 186 | assert np.array_equal(labels, expected) 187 | -------------------------------------------------------------------------------- /tests/unit/test_fileio.py: -------------------------------------------------------------------------------- 1 | from tidyms import fileio 2 | from tidyms.utils import get_tidyms_path 3 | import os 4 | import pytest 5 | 6 | 7 | def test_read_mzmine(): 8 | dataset_name = "test-mzmine" 9 | cache_path = get_tidyms_path() 10 | data_path = os.path.join(cache_path, dataset_name) 11 | data_matrix_path = os.path.join(data_path, "data.csv") 12 
| sample_metadata_path = os.path.join(data_path, "sample.csv") 13 | try: 14 | fileio.read_mzmine(data_matrix_path, sample_metadata_path) 15 | except FileNotFoundError: 16 | fileio.download_dataset(dataset_name) 17 | fileio.read_mzmine(data_matrix_path, sample_metadata_path) 18 | assert True 19 | 20 | 21 | def test_read_progenesis(): 22 | # progenesis data is contained in one file 23 | dataset_name = "test-progenesis" 24 | cache_path = get_tidyms_path() 25 | data_path = os.path.join(cache_path, dataset_name) 26 | data_matrix_path = os.path.join(data_path, "data.csv") 27 | try: 28 | fileio.read_progenesis(data_matrix_path) 29 | except FileNotFoundError: 30 | fileio.download_dataset(dataset_name) 31 | fileio.read_progenesis(data_matrix_path) 32 | assert True 33 | 34 | 35 | def test_read_xcms(): 36 | dataset_name = "test-xcms" 37 | cache_path = get_tidyms_path() 38 | data_path = os.path.join(cache_path, dataset_name) 39 | data_matrix_path = os.path.join(data_path, "data.csv") 40 | sample_metadata_path = os.path.join(data_path, "sample.csv") 41 | feature_metadata_path = os.path.join(data_path, "feature.csv") 42 | try: 43 | fileio.read_xcms(data_matrix_path, feature_metadata_path, 44 | sample_metadata_path) 45 | except FileNotFoundError: 46 | fileio.download_dataset(dataset_name) 47 | fileio.read_xcms(data_matrix_path, feature_metadata_path, 48 | sample_metadata_path) 49 | assert True 50 | 51 | 52 | def test_read_compressed_indexed_mzml(centroid_mzml): 53 | n_spectra = centroid_mzml.get_n_spectra() 54 | n_chromatogram = centroid_mzml.get_n_chromatograms() 55 | 56 | # test spectra 57 | for k in range(n_spectra): 58 | centroid_mzml.get_spectrum(k) 59 | 60 | # test chromatogram 61 | for k in range(n_chromatogram): 62 | centroid_mzml.get_chromatogram(k) 63 | 64 | assert True 65 | 66 | 67 | def test_read_uncompressed_indexed_mzml(): 68 | cache_path = get_tidyms_path() 69 | filename = "centroid-data-indexed-uncompressed.mzML" 70 | data_path = os.path.join(cache_path, 
"test-raw-data", filename) 71 | ms_data = fileio.MSData.create_MSData_instance(data_path) 72 | n_spectra = ms_data.get_n_spectra() 73 | n_chromatogram = ms_data.get_n_chromatograms() 74 | 75 | # test spectra 76 | for k in range(n_spectra): 77 | ms_data.get_spectrum(k) 78 | 79 | # test chromatogram 80 | for k in range(n_chromatogram): 81 | ms_data.get_n_chromatograms() 82 | 83 | assert True 84 | 85 | 86 | def test_read_compressed_no_index_mzml(): 87 | cache_path = get_tidyms_path() 88 | filename = "centroid-data-zlib-no-index-compressed.mzML" 89 | data_path = os.path.join(cache_path, "test-raw-data", filename) 90 | ms_data = fileio.MSData.create_MSData_instance(data_path) 91 | n_spectra = ms_data.get_n_spectra() 92 | n_chromatogram = ms_data.get_n_chromatograms() 93 | 94 | # test spectra 95 | for k in range(n_spectra): 96 | ms_data.get_spectrum(k) 97 | 98 | # test chromatogram 99 | for k in range(n_chromatogram): 100 | ms_data.get_n_chromatograms() 101 | 102 | assert True 103 | 104 | 105 | def test_get_spectra_iterator_start(centroid_mzml): 106 | start = 9 107 | sp_iterator = centroid_mzml.get_spectra_iterator(start=start) 108 | for scan, sp in sp_iterator: 109 | assert scan >= start 110 | 111 | 112 | def test_get_spectra_iterator_end(centroid_mzml): 113 | expected_end = 20 114 | sp_iterator = centroid_mzml.get_spectra_iterator(end=expected_end) 115 | for scan, sp in sp_iterator: 116 | assert scan < expected_end 117 | 118 | 119 | def test_get_spectra_iterator_ms_level(centroid_mzml): 120 | expected_ms_level = 2 121 | sp_iterator = centroid_mzml.get_spectra_iterator(ms_level=expected_ms_level) 122 | for scan, sp in sp_iterator: 123 | assert sp.ms_level == expected_ms_level 124 | 125 | 126 | def test_get_spectra_iterator_start_time(centroid_mzml): 127 | start_time = 10 128 | sp_iterator = centroid_mzml.get_spectra_iterator(start_time=start_time) 129 | for scan, sp in sp_iterator: 130 | assert sp.time >= start_time 131 | 132 | 133 | def 
test_get_spectra_iterator_end_time(centroid_mzml): 134 | end_time = 20 135 | sp_iterator = centroid_mzml.get_spectra_iterator(end_time=end_time) 136 | for scan, sp in sp_iterator: 137 | assert sp.time < end_time 138 | 139 | 140 | def test_centroids(profile_mzml): 141 | profile_mzml.get_spectrum(0).find_centroids() 142 | assert True 143 | 144 | 145 | def test_load_dataset(): 146 | for d in fileio.list_available_datasets(): 147 | fileio.load_dataset(d) 148 | 149 | 150 | def test_load_dataset_invalid_dataset(): 151 | with pytest.raises(ValueError): 152 | fileio.load_dataset("invalid-dataset") 153 | -------------------------------------------------------------------------------- /tests/unit/test_fill_missing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tidyms as ms 3 | 4 | 5 | def test_get_fill_area_no_peaks_detected(monkeypatch): 6 | time = np.arange(100) 7 | spint = np.ones_like(time) 8 | chromatogram = ms.Chromatogram(time, spint) 9 | rt = 50 10 | rt_std = 10 11 | n_dev = 1 12 | 13 | def mock_extract_features(self, **kwargs): 14 | self.features = list() 15 | 16 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 17 | 18 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 19 | assert area is None 20 | 21 | 22 | def test_get_fill_area_peak_detected_outside_valid_range(monkeypatch): 23 | time = np.arange(100) 24 | spint = np.ones_like(time) 25 | chromatogram = ms.Chromatogram(time, spint) 26 | rt = 50 27 | rt_std = 10 28 | n_dev = 1 29 | 30 | def mock_extract_features(self, **kwargs): 31 | self.features = [ms.lcms.Peak(70, 75, 80, self)] 32 | 33 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 34 | 35 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 36 | assert area is None 37 | 38 | 39 | def test_get_fill_area_peak_detected_inside_valid_range(monkeypatch): 40 | time = np.arange(100) 41 | 
spint = np.ones_like(time) 42 | chromatogram = ms.Chromatogram(time, spint) 43 | chromatogram.baseline = np.zeros_like(time) 44 | rt = 50 45 | rt_std = 10 46 | n_dev = 1 47 | test_peak = ms.lcms.Peak(50, 55, 60, chromatogram) 48 | expected_area = test_peak.get_area() 49 | 50 | def mock_extract_features(self, **kwargs): 51 | self.features = [test_peak] 52 | 53 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 54 | 55 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 56 | assert np.isclose(area, expected_area) 57 | 58 | 59 | def test_get_fill_area_multiple_valid_peaks_choose_closest(monkeypatch): 60 | time = np.arange(100) 61 | spint = np.ones_like(time) 62 | chromatogram = ms.Chromatogram(time, spint) 63 | chromatogram.baseline = np.zeros_like(time) 64 | rt = 50 65 | rt_std = 10 66 | n_dev = 1 67 | valid_peak = ms.lcms.Peak(45, 50, 52, chromatogram) 68 | detected_peaks = [valid_peak, ms.lcms.Peak(55, 60, 65, chromatogram)] 69 | expected_area = valid_peak.get_area() 70 | 71 | def mock_extract_features(self, **kwargs): 72 | self.features = detected_peaks 73 | 74 | monkeypatch.setattr(ms.Chromatogram, "extract_features", mock_extract_features) 75 | 76 | area = ms.fill_missing._get_fill_area(chromatogram, rt, rt_std, n_dev) 77 | assert np.isclose(area, expected_area) 78 | -------------------------------------------------------------------------------- /tests/unit/test_filter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import tidyms as ms 4 | 5 | 6 | def test_class_remover(data_container_with_order): 7 | rm = ["QC"] 8 | data = data_container_with_order 9 | n_qc_samples = (data.sample_metadata["class"] == 'QC').sum() 10 | n_samples = data.data_matrix.shape[0] 11 | crm = ms.filter.ClassRemover(rm) 12 | crm.process(data) 13 | assert data.data_matrix.shape[0] == (n_samples - n_qc_samples) 14 | 15 | 16 | def 
test_class_remover_invalid_class(data_container_with_order): 17 | rm = ["invalid_class"] 18 | data = data_container_with_order 19 | crm = ms.filter.ClassRemover(rm) 20 | n_samples = data.data_matrix.shape[0] 21 | crm.process(data) 22 | assert data.data_matrix.shape[0] == n_samples 23 | 24 | 25 | # def test_prevalence_filter_remove_none(data_container_with_order): 26 | # data = data_container_with_order 27 | # process_classes = None 28 | # lb = 0 29 | # ub = 1 30 | # intraclass = True 31 | # threshold = 0 32 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes, lb=lb, 33 | # ub=ub, intraclass=intraclass, 34 | # threshold=threshold) 35 | # pf.process(data) 36 | # assert True 37 | # 38 | # 39 | # def test_prevalence_filter_remove_one_feature(data_container_with_order): 40 | # data = data_container_with_order 41 | # rm_ft = "FT01" 42 | # data._data_matrix.loc[:, rm_ft] = 0 43 | # process_classes = None 44 | # lb = 0.1 45 | # ub = 1 46 | # intraclass = True 47 | # threshold = 0 48 | # pf = ms.filter.PrevalenceFilter(process_classes=process_classes, 49 | # lb=lb, 50 | # ub=ub, 51 | # intraclass=intraclass, 52 | # threshold=threshold) 53 | # pf.process(data) 54 | # assert rm_ft in pf.remove 55 | # 56 | # 57 | # def test_blank_filter_custom_func(data_container_with_order): 58 | # data = data_container_with_order 59 | # bc = ms.filter.BlankCorrector(mode=lambda x: 20) 60 | # bc.process(data) 61 | # assert (data._data_matrix[data.classes 62 | # .isin(bc.params["process_classes"])] == 0).all().all() 63 | # 64 | # 65 | # def test_variation_filter(data_container_with_order): 66 | # data = data_container_with_order 67 | # vf = ms.filter.VariationFilter(lb=0, 68 | # ub=0.2, 69 | # process_classes=None) 70 | # vf.process(data) 71 | # print(vf.remove) 72 | # assert vf.remove.empty 73 | -------------------------------------------------------------------------------- /tests/unit/test_peaks.py: 
-------------------------------------------------------------------------------- 1 | import tidyms as ms 2 | import numpy as np 3 | import pytest 4 | from scipy.signal.windows import gaussian 5 | from scipy.special import erfc 6 | from scipy.ndimage import gaussian_filter1d 7 | # from itertools import product 8 | 9 | # random seed 10 | SEED = 1234 11 | 12 | 13 | # noise estimation tests 14 | 15 | @pytest.fixture 16 | def noise(): 17 | sigma = 1.0 18 | np.random.seed(SEED) 19 | return np.random.normal(size=500, scale=sigma), sigma 20 | 21 | 22 | def test_estimate_local_noise_empty_signal(): 23 | x = np.array([]) 24 | noise = ms.peaks._estimate_local_noise(x) 25 | assert np.isclose(noise, 0.0) 26 | 27 | 28 | @pytest.mark.parametrize("x", [np.array([1]), np.array([1, 2])]) 29 | def test_estimate_local_noise_signal_length_lower_than_two(x): 30 | noise = ms.peaks._estimate_local_noise(x) 31 | assert np.isclose(noise, 0.0) 32 | 33 | 34 | def test_estimate_local_noise(noise): 35 | # check that the noise estimation is close to the std of a normal 36 | # distribution 37 | x, sigma = noise 38 | noise_estimation = ms.peaks._estimate_local_noise(x) 39 | # noise should be close to sigma, check with a 20 % tolerance 40 | assert (sigma < 1.2 * noise_estimation) 41 | 42 | 43 | def test_estimate_local_noise_non_robust(noise): 44 | x, sigma = noise 45 | noise_estimation = ms.peaks._estimate_local_noise(x, robust=False) 46 | # noise should be close to sigma, check with a 20 % tolerance 47 | assert (sigma < 1.2 * noise_estimation) 48 | 49 | 50 | def test_estimate_noise_empty_array(): 51 | x = np.array([]) 52 | noise = ms.peaks.estimate_noise(x) 53 | assert noise.size == 0.0 54 | 55 | 56 | @pytest.mark.parametrize("x", [np.array([1]), np.array([1, 3]), 57 | np.array([1, 4, 6])]) 58 | def test_estimate_noise_signal_length_lower_than_two(x): 59 | noise_estimation = ms.peaks.estimate_noise(x) 60 | assert np.allclose(noise_estimation, 0.0) 61 | 62 | 63 | def 
test_estimate_noise_check_size(noise): 64 | noise, sigma = noise 65 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=2) 66 | assert noise.size == noise_estimation.size 67 | 68 | 69 | def test_estimate_noise_n_slices(noise): 70 | noise, sigma = noise 71 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=2) 72 | size = noise.size 73 | half = size // 2 74 | # check that the noise estimation was done for 2 slices 75 | assert np.allclose(noise_estimation[:half], noise_estimation[0]) 76 | assert np.allclose(noise_estimation[half:], noise_estimation[half]) 77 | # check that the estimation on each slice is different 78 | assert noise_estimation[0] != noise_estimation[half] 79 | 80 | 81 | def test_estimate_noise_min_slice_size(noise): 82 | noise, sigma = noise 83 | n_slices = 5 84 | min_slice_size = 150 85 | noise_estimation = ms.peaks.estimate_noise(noise, n_slices=n_slices, 86 | min_slice_size=min_slice_size) 87 | # noise has a size of 500, the slice is going to be 100 < 150 88 | # check that 150 is used instead. 89 | slice_boundaries = [0, 150, 300, 500] # the last slice is extended to 200 90 | # to prevent the creation of a slice of size 50 91 | for k in range(len(slice_boundaries) - 1): 92 | start = slice_boundaries[k] 93 | end = slice_boundaries[k + 1] 94 | assert np.allclose(noise_estimation[start:end], noise_estimation[start]) 95 | 96 | 97 | # Test baseline estimation 98 | 99 | def test_find_local_extrema(): 100 | x = np.arange(10) 101 | # reflect and merge the concatenate x. 
local extrema should be 0, 9, 19 102 | x = np.hstack((x, x[::-1])) 103 | test_output = ms.peaks._find_local_extrema(x) 104 | expected_output = [0, 9, 19] 105 | assert np.array_equal(test_output, expected_output) 106 | 107 | 108 | def test_find_local_extrema_no_local_maximum(): 109 | x = np.arange(10) 110 | test_output = ms.peaks._find_local_extrema(x) 111 | expected_output = np.array([]) 112 | assert np.array_equal(test_output, expected_output) 113 | 114 | 115 | test_noise_sum_params = [[np.array([0, 1]), np.sqrt([25, 25])], 116 | [np.array([0]), np.sqrt([34])]] 117 | 118 | 119 | @pytest.mark.parametrize("index,expected", test_noise_sum_params) 120 | def test_get_noise_sum_slice_std(index, expected): 121 | index = np.array(index) 122 | expected = np.array(expected) 123 | x = np.array([3, 4, 2, 2, 1]) 124 | test_output = ms.peaks._get_noise_slice_sum_std(x, index) 125 | assert np.allclose(test_output, expected) 126 | 127 | 128 | def test_estimate_noise_probability(): 129 | noise = np.ones(7) 130 | x = np.array([0, 0.1, 0.4, 2, 1.25, 1.1, 1.0]) 131 | extrema = np.array([0, 3, 6]) 132 | # two slices of size 4 and 2 respectively, the expected output should 133 | # be erfc(1/sqrt(2) and erfc(1) 134 | expected_output = erfc([2.5 * np.sqrt(1 / 2) / 2, 135 | 1.35 * np.sqrt(1 / 2) / 2]) 136 | test_output = ms.peaks._estimate_noise_probability(noise, x, extrema) 137 | assert np.allclose(expected_output, test_output) 138 | 139 | 140 | def test_build_baseline_index(): 141 | x = np.array([0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2, 1, 0]) 142 | extrema = np.array([0, 2, 4, 6, 8, 10, 12]) 143 | noise_probability = np.array([0, 0.25, 0.25, 0.25, 0, 0]) 144 | min_proba = 0.05 145 | expected = np.array([0, 4, 5, 6, 12]) 146 | test = ms.peaks._build_baseline_index(x, noise_probability, min_proba, 147 | extrema) 148 | assert np.array_equal(expected, test) 149 | 150 | 151 | def test_estimate_baseline(): 152 | # a simple test, a noise array is built using a noise level greater 153 | # than the 
noise level in the signal. All points should be classified as 154 | # baseline 155 | n = 100 156 | x = np.random.normal(size=n, scale=1) 157 | noise = np.ones(n) * 5 158 | baseline = ms.peaks.estimate_baseline(x, noise) 159 | expected_baseline_index = np.arange(n) 160 | test_baseline_index = np.where(np.abs(x - baseline) < noise)[0] 161 | assert np.array_equal(expected_baseline_index, test_baseline_index) 162 | 163 | 164 | @pytest.fixture 165 | def single_peak(noise): 166 | noise, sigma = noise 167 | x = gaussian(noise.size, 2) * 20 168 | return x 169 | 170 | 171 | @pytest.fixture 172 | def two_non_overlapping_peaks(noise): 173 | noise, sigma = noise 174 | x = np.arange(noise.size) 175 | params = np.array([[100, 2, 50], [150, 2, 25]]) 176 | y = ms.utils.gaussian_mixture(x, params).sum(axis=0) 177 | return y, params 178 | 179 | 180 | def test_detect_peaks_one_peak(single_peak, noise): 181 | noise, sigma = noise 182 | x = single_peak + noise 183 | noise_estimation = ms.peaks.estimate_noise(x) 184 | # smooth x to reduce the number of detected peaks 185 | x = gaussian_filter1d(x, 1.0) 186 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 187 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 188 | assert len(peaks[0]) == 1 189 | 190 | 191 | def test_detect_peaks_two_non_overlapping_peaks(two_non_overlapping_peaks, 192 | noise): 193 | noise, sigma = noise 194 | x, _ = two_non_overlapping_peaks 195 | x = x + noise 196 | noise_estimation = ms.peaks.estimate_noise(x) 197 | # smooth x to reduce the number of detected peaks 198 | x = gaussian_filter1d(x, 1.0) 199 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 200 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 201 | assert len(peaks[0]) == 2 202 | 203 | 204 | @pytest.fixture 205 | def two_overlapping_peaks(noise): 206 | noise, sigma = noise 207 | x = np.arange(noise.size) 208 | params = np.array([[100, 2, 50], [108, 2, 25]]) 209 | y = 
ms.utils.gaussian_mixture(x, params).sum(axis=0) 210 | return y, params 211 | 212 | 213 | def test_detect_peaks_two_overlapping_peaks(two_overlapping_peaks, noise): 214 | noise, sigma = noise 215 | x, _ = two_overlapping_peaks 216 | x = x + noise 217 | noise_estimation = ms.peaks.estimate_noise(x) 218 | # smooth x to reduce the number of detected peaks 219 | x = gaussian_filter1d(x, 1.0) 220 | baseline_estimation = ms.peaks.estimate_baseline(x, noise) 221 | peaks = ms.peaks.detect_peaks(x, noise_estimation, baseline_estimation) 222 | start, apex, end = peaks 223 | # only two peaks are detected 224 | assert len(start) == 2 225 | # check the boundary of the overlapping peaks 226 | assert end[0] == (start[1] + 1) 227 | -------------------------------------------------------------------------------- /tests/unit/test_validation.py: -------------------------------------------------------------------------------- 1 | from tidyms.validation import * 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def example_validator(): 7 | schema = { 8 | "positive_number": {"is_positive": True}, 9 | "a": {"lower_than": "b"}, 10 | "b": {"lower_or_equal": "c"}, 11 | "c": {"type": "number"}, 12 | "some_function": {"check_with": is_callable} 13 | } 14 | return ValidatorWithLowerThan(schema) 15 | 16 | 17 | def test_is_positive_positive_number(example_validator): 18 | params = {"positive_number": 5} 19 | validate(params, example_validator) 20 | assert True 21 | 22 | 23 | def test_is_positive_zero(example_validator): 24 | params = {"positive_number": 0} 25 | with pytest.raises(ValueError): 26 | validate(params, example_validator) 27 | 28 | 29 | def test_is_positive_negative_number(example_validator): 30 | params = {"positive_number": -1} 31 | with pytest.raises(ValueError): 32 | validate(params, example_validator) 33 | 34 | 35 | def test_lower_than_valid(example_validator): 36 | # a must be lower than b 37 | params = {"a": 5, "b": 6} 38 | validate(params, example_validator) 39 | assert True 
40 | 41 | 42 | def test_lower_than_invalid(example_validator): 43 | # a must be lower than b 44 | params = {"a": 5, "b": 4} 45 | with pytest.raises(ValueError): 46 | validate(params, example_validator) 47 | 48 | 49 | def test_lower_than_invalid_equal(example_validator): 50 | # a must be lower than b 51 | params = {"a": 5, "b": 5} 52 | with pytest.raises(ValueError): 53 | validate(params, example_validator) 54 | 55 | 56 | def test_lower_or_equal_valid(example_validator): 57 | # a must be lower than b 58 | params = {"b": 5, "c": 7} 59 | validate(params, example_validator) 60 | assert True 61 | 62 | 63 | def test_lower_or_equal_valid_equal(example_validator): 64 | # a must be lower than b 65 | params = {"b": 5, "c": 5} 66 | validate(params, example_validator) 67 | assert True 68 | 69 | 70 | def test_lower_or_equal_invalid(example_validator): 71 | # a must be lower than b 72 | params = {"b": 5, "c": 4} 73 | with pytest.raises(ValueError): 74 | validate(params, example_validator) 75 | 76 | 77 | def test_is_callable_valid(example_validator): 78 | # a must be lower than b 79 | params = {"some_function": sum} 80 | validate(params, example_validator) 81 | assert True 82 | 83 | 84 | def test_is_callable_invalid(example_validator): 85 | # a must be lower than b 86 | params = {"some_function": "invalid_value"} 87 | with pytest.raises(ValueError): 88 | validate(params, example_validator) 89 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | python3.9,python3.10 4 | 5 | [testenv] 6 | deps= -rtest_requirements.txt 7 | commands=pytest --------------------------------------------------------------------------------