├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── examples └── baseline_correction_examples.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── pyPreprocessing │ ├── __init__.py │ ├── baseline_correction.py │ ├── smoothing.py │ └── transform.py └── tests ├── __init__.py ├── test_baseline_correction.py ├── test_smoothing.py └── test_transform.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | pytest --cov=./ --cov-report=xml 41 | - name: Upload coverage to Codecov 42 | uses: codecov/codecov-action@v5 43 | with: 44 | token: ${{ secrets.CODECOV_TOKEN }} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | to_do.txt 3 | .pylint.d 4 | /dist/ 5 | /src/*.egg-info -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Alexander Southan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
2 | [![build workflow](https://github.com/AlexanderSouthan/pyPreprocessing/actions/workflows/python-package.yml/badge.svg)](https://github.com/AlexanderSouthan/pyPreprocessing/actions/workflows/python-package.yml)
3 | [![codecov](https://codecov.io/gh/AlexanderSouthan/pyPreprocessing/branch/master/graph/badge.svg?token=7GN1K2MVJ3)](https://codecov.io/gh/AlexanderSouthan/pyPreprocessing)
4 | 
5 | # pyPreprocessing
6 | ## General information
7 | pyPreprocessing is a package for preprocessing datasets such as Raman spectra,
8 | infrared spectra, UV/Vis spectra, but also HPLC data and many other types of
9 | data, currently via baseline correction, smoothing, filtering, transformation,
10 | normalization and derivatives. It relies on numpy, pandas, scipy, tqdm and
11 | scikit-learn, but also on https://github.com/AlexanderSouthan/pyDataFitting
12 | for the introduction of equality constraints into the polynomial baseline
13 | estimation methods, and on https://github.com/AlexanderSouthan/little_helpers.
14 | 
15 | ## Documentation
16 | Please visit:
17 | https://alexandersouthan.github.io/pyPreprocessing/
18 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: pyPreprocessing
2 | dependencies:
3 |   - numpy
4 |   - pandas
5 |   - matplotlib
6 |   - scikit-learn
7 |   - scipy
8 |   - tqdm
9 |   - pip
10 |   - pip:
11 |     - pyDataFitting
12 |     - little_helpers
13 | 
--------------------------------------------------------------------------------
/examples/baseline_correction_examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | 
8 | from pyPreprocessing.baseline_correction import generate_baseline
9 | from pyPreprocessing.smoothing import smoothing
10 | from little_helpers.math_functions import gaussian
11 | from little_helpers.num_derive import derivative
12 | 
13 | 
14 | def simulate_spectrum(peak_centers, peak_amplitudes, peak_widths,
15 |                       baseline_type='polynomial', baseline_parameters=[1],
16 |                       noise_level=1, wn_start=0, wn_end=1000,
17 |                       data_points=1000):
18 |     """
19 |     Calculate a spectrum with Gaussian peaks.
20 | 
21 |     Parameters
22 |     ----------
23 |     peak_centers : list of float
24 |         The peak centers.
25 |     peak_amplitudes : list of float
26 |         The peak amplitudes, i.e. the maximum value of the peak.
27 |     peak_widths : list of float
28 |         The sigma of the Gaussian peaks.
29 |     baseline_type : str, optional
30 |         The baseline type, currently only 'polynomial' using the calc_function
31 |         polynomial calculation. The default is 'polynomial'.
32 |     baseline_parameters : list of float, optional
33 |         Parameters passed to calc_function. The default is [1], resulting in a
34 |         constant baseline with a value of 1 in case baseline_type is
35 |         'polynomial'.
36 | noise_level : float, optional 37 | The maximum level of the noise. The default is 1. 38 | wn_start : float, optional 39 | The start wavenumber used for spectrum calculation. The default is 0. 40 | wn_end : float, optional 41 | The end wavenumber used for spectrum calculation. The default is 1000. 42 | data_points : int, optional 43 | The number of evenly spaced data points between wn_start and wn_end 44 | used for spectrum calculation. The default is 1000. 45 | 46 | Returns 47 | ------- 48 | ndarray 49 | 2D array with the wavenumbers and the intensities. 50 | 51 | """ 52 | # Calculate wavennumbers 53 | wavenumbers = np.linspace(wn_start, wn_end, num=data_points) 54 | 55 | # Pass Gaussian paramters to gaussian for pure 56 | # spectrum intensities without noise and baseline contributions 57 | pure_intensities = gaussian(wavenumbers, peak_amplitudes, peak_centers, 58 | np.zeros_like(peak_amplitudes), peak_widths) 59 | 60 | # Calculate noise as random Gaussian noise 61 | rng = np.random.default_rng() 62 | noise = rng.standard_normal(len(pure_intensities)) * noise_level 63 | 64 | # Calculate baseline 65 | if baseline_type == 'polynomial': 66 | baseline = np.polynomial.polynomial.polyval(wavenumbers, baseline_parameters) 67 | else: 68 | baseline = np.zeros_like(pure_intensities) 69 | 70 | # Calculate spectrum intensities as the sum of pure intensities, noise and 71 | # baseline contribution 72 | intensities = pure_intensities + noise + baseline 73 | 74 | return np.array([wavenumbers, intensities]) 75 | 76 | 77 | spectrum = simulate_spectrum([200, 250, 500], [10, 5, 20], [10, 40, 5], 78 | baseline_parameters=[5, 0.01, 0.0003], noise_level=1) 79 | spectrum_clean = simulate_spectrum([200, 250, 500], [10, 5, 20], [10, 40, 5], 80 | baseline_parameters=[0], noise_level=0) 81 | 82 | smoothed_spectrum = smoothing(spectrum[1][np.newaxis], 'sav_gol', savgol_points=10, savgol_order=9) 83 | derived_spectrum = derivative(spectrum[0], smoothed_spectrum) 84 | derived_spectrum_2 = derivative(spectrum[0], smoothed_spectrum, order=2) 85 | 86 | # baseline_ALSS = np.squeeze( 87 | # generate_baseline( 88 | # spectrum[1][np.newaxis], 'ALSS', smoothing=True)) 89 | # baseline_iALSS = np.squeeze( 90 | # generate_baseline( 91 | # spectrum[1][np.newaxis], 'iALSS', smoothing=True)) 92 | # baseline_drPLS = np.squeeze( 93 | # generate_baseline( 94 | # spectrum[1][np.newaxis], 'drPLS', smoothing=True)) 95 | # baseline_SNIP = np.squeeze( 96 | # generate_baseline( 97 | # spectrum[1][np.newaxis], 'SNIP', smoothing=True, transform=False)) 98 | # baseline_ModPoly = np.squeeze( 99 | # generate_baseline( 100 | # spectrum[1][np.newaxis], 'ModPoly', smoothing=True, 101 | # wavenumbers=spectrum[0])) 102 | baseline_IModPoly = np.squeeze( 103 | generate_baseline( 104 | spectrum[1][np.newaxis], 'IModPoly', smoothing=True, 105 | wavenumbers=spectrum[0], poly_order=3)) 106 | # baseline_convex_hull = np.squeeze( 107 | # generate_baseline( 108 | # spectrum[1][np.newaxis], 'convex_hull', smoothing=True, 109 | # wavenumbers=spectrum[0])) 110 | baseline_PPF = np.squeeze( 111 | generate_baseline( 112 | spectrum[1][np.newaxis], 'PPF', smoothing=True, 113 | wavenumbers=spectrum[0], segment_borders=[500], poly_orders=[1, 1], 114 | y_at_borders=[80])) 115 | 116 | plt.figure() 117 | plt.plot(spectrum[0], spectrum[1]) 118 | plt.plot(spectrum[0], smoothed_spectrum.T) 119 | # plt.plot(spectrum[0], baseline_ALSS, label='ALSS') 120 | # plt.plot(spectrum[0], baseline_iALSS, label='iALLS') 121 | # plt.plot(spectrum[0], baseline_drPLS, label='drPLS') 122 
| # plt.plot(spectrum[0], baseline_SNIP, label='SNIP') 123 | # plt.plot(spectrum[0], baseline_ModPoly, label='ModPoly') 124 | plt.plot(spectrum[0], baseline_IModPoly, label='IModPoly') 125 | # plt.plot(spectrum[0], baseline_convex_hull, label='Convex hull') 126 | plt.plot(spectrum[0], baseline_PPF, label='PPF') 127 | # # plt.plot(spectrum[0], np.squeeze(smoothing(spectrum[1][np.newaxis], 'sav_gol', savgol_points=19))) 128 | plt.legend() 129 | 130 | plt.figure() 131 | plt.plot(spectrum[0], np.squeeze(derived_spectrum)) 132 | 133 | plt.figure() 134 | plt.plot(spectrum[0], np.squeeze(derived_spectrum_2)) 135 | 136 | plt.figure() 137 | plt.plot(spectrum[0], spectrum[1]-baseline_IModPoly.T) 138 | plt.plot(spectrum[0], spectrum_clean[1]) 139 | 140 | std_rolling = pd.Series(spectrum[1]).rolling(25, center=True).std() 141 | mean_rolling = pd.Series(spectrum[1]).rolling(25, center=True).mean() 142 | 143 | plt.figure() 144 | plt.plot(spectrum[0], std_rolling*mean_rolling) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | scikit-learn 5 | scipy 6 | tqdm 7 | pyDataFitting 8 | little_helpers 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pyPreprocessing 3 | version = 0.0.2 4 | author = Alexander Southan 5 | author_email = alexander.southan@web.de 6 | description = package preprocessing of datasets, especially from spectroscopy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/AlexanderSouthan/pyPreprocessing 10 | project_urls = 11 | Bug Tracker = https://github.com/AlexanderSouthan/pyPreprocessing/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='pyPreprocessing', 5 | version='0.0.2', 6 | packages=find_packages(where='src'), 7 | install_requires=['numpy', 'pandas', 'scipy', 'matplotlib', 'scikit-learn', 'tqdm'], 8 | dependency_links=['http://github.com/user/repo/tarball/master#egg=package-1.0'] 9 | ) 10 | -------------------------------------------------------------------------------- /src/pyPreprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /src/pyPreprocessing/baseline_correction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions correct_baseline and generate_baseline which can be used for 4 | baseline preprocessing of spectral data. See function docstrings for more 5 | detail. 
6 | """
7 | 
8 | import numpy as np
9 | from tqdm import tqdm
10 | from scipy.sparse import diags
11 | from scipy.sparse.linalg import spsolve
12 | from scipy.spatial import ConvexHull
13 | 
14 | from pyDataFitting.polynomial_regression import (polynomial_fit,
15 |                                                  piecewise_polynomial_fit)
16 | from little_helpers.math_functions import piecewise_polynomial
17 | from .transform import transform as transform_spectra
18 | from .smoothing import smoothing as smooth_spectra
19 | from little_helpers.array_tools import y_at_x
20 | 
21 | 
22 | def correct_baseline(raw_data, mode, smoothing=True, transform=False,
23 |                      **kwargs):
24 |     """
25 |     Subtract baseline data from raw_data.
26 | 
27 |     Baseline data is either given explicitly with mode='direct', or is
28 |     calculated with generate_baseline(...). In the former case, it has to have
29 |     the same number of data points per dataset as raw_data, or it has to
30 |     have the same shape as raw_data. In the latter case, the function takes
31 |     the same arguments as generate_baseline; for details see the docstring of
32 |     generate_baseline.
33 |     """
34 |     # baseline_data = np.array([])
35 |     if mode == 'direct':
36 |         baseline_data = kwargs.get('baseline_data')
37 |     else:
38 |         baseline_data = generate_baseline(raw_data, mode, smoothing=smoothing,
39 |                                           transform=transform, **kwargs)
40 |     return raw_data - baseline_data
41 | 
42 | 
43 | def generate_baseline(raw_data, mode, smoothing=True, transform=False,
44 |                       **kwargs):
45 |     """
46 |     Calculate baseline data on input datasets with different algorithms.
47 | 
48 |     Input data:
49 |     -----------
50 |     raw_data: ndarray
51 |         Numpy 2D array of shape (N, M) with N datasets and M data points per
52 |         dataset. If only one dataset is given, it has to have the shape (1, M).
53 |     mode: str
54 |         Algorithm for baseline calculation. Allowed values:
55 |         'convex_hull', 'ALSS', 'iALSS', 'drPLS', 'SNIP', 'ModPoly', 'IModPoly',
56 |         'PPF', 'from_measurement'.
57 |     smoothing: bool
58 |         True if datasets should be smoothed before calculation (recommended),
59 |         otherwise False.
60 |     transform: bool
61 |         True if datasets should be transformed before calculation,
62 |         otherwise False.
63 | 
64 |     kwargs for smoothing == True
65 |     ----------------------------
66 |     savgol_window: int
67 |         window size for Savitzky-Golay window, default=9.
68 |     savgol_order: int
69 |         polynomial order for Savitzky-Golay filter, default=2.
70 | 
71 |     kwargs for transform == True
72 |     ----------------------------
73 |     currently none, but some may be added in future versions.
74 | 
75 |     kwargs for different baseline modes:
76 |     ------------------------------------
77 |     convex_hull:
78 |         wavenumbers: ndarray
79 |             Numpy array containing wavenumbers or wavelengths of datasets.
80 |             Must have M elements and must be sorted. default=np.arange(M)
81 |     ALSS:
82 |         lam: float
83 |             default=10000
84 |         p: float
85 |             default=0.001
86 |         n_iter: int
87 |             default=10
88 |         conv_crit: float
89 |             default=0.001
90 |     iALSS:
91 |         lam: float
92 |             default=2000
93 |         lam_1: float
94 |             default=0.01
95 |         p: float
96 |             default=0.01
97 |         n_iter: int
98 |             default=10
99 |         conv_crit: float
100 |             default=0.001
101 |         wavenumbers: ndarray
102 |             Numpy array containing wavenumbers or wavelengths of datasets.
103 |             Must have M elements. default=np.arange(M)
104 |     drPLS:
105 |         lam: float
106 |             default=1000000
107 |         eta: float
108 |             default=0.5
109 |         n_iter: int
110 |             default=100
111 |         conv_crit: float
112 |             default=0.001
113 |     SNIP:
114 |         n_iter: int
115 |             default=100
116 |     ModPoly, IModPoly:
117 |         wavenumbers: ndarray
118 |             Numpy array containing wavenumbers or wavelengths of datasets.
119 |             Must have M elements and must be sorted. default=np.arange(M)
120 |         n_iter: int
121 |             default=100
122 |         poly_order: int
123 |             default=5
124 |         fixed_points: list of tuples, optional
125 |             Contains constraints for points that the baseline must
126 |             pass through. Each point is given by a tuple of two numbers,
127 |             the wavenumber and the intensity of the point. If no point
128 |             constraints are to be applied, this must be None. The
129 |             default is None.
130 |         fixed_slopes: list of tuples, optional
131 |             Contains constraints for slopes that the fit functions must
132 |             have at specific wavenumbers. Each slope is given by a tuple of
133 |             two numbers, the wavenumber and the slope. If no slope
134 |             constraints are to be applied, this must be None. The
135 |             default is None.
136 |     PPF:
137 |         wavenumbers: ndarray, optional
138 |             Numpy array containing wavenumbers or wavelengths of datasets.
139 |             Must have M elements and must be sorted. default=np.arange(M).
140 |         n_iter: int, optional
141 |             default=100
142 |         segment_borders : list of int or float, optional
143 |             The values with respect to wavenumbers at which the data is divided
144 |             into segments. An arbitrary number of segment borders may be given,
145 |             but it is recommended to provide a sorted list in order to avoid
146 |             confusion. If the list is not sorted, it will be sorted. The
147 |             default is [wavenumbers[len(wavenumbers)//2]], resulting in a
148 |             segmentation in the middle of the data.
149 |         poly_orders : list of int, optional
150 |             A list containing the polynomial orders used for the baseline fit.
151 |             Must contain one more element than segment_borders. Default is
152 |             [3, 3].
153 |         fit_method: str, optional
154 |             Defines if the polynomial baseline fit of the segments is
155 |             performed by the ModPoly ('ModPoly') or IModPoly ('IModPoly')
156 |             algorithm. Default is 'ModPoly'.
157 |         y_at_borders : None, or list of float or None, or 'int_at_borders',
158 |             optional
159 |             May contain dependent variable values used as equality constraints
160 |             at the segment borders. The fits of both touching segments are
161 |             forced through the point given by the pair (segment border,
162 |             y_at_border). The list entries may also be None to state that at a
163 |             certain segment border, no constraint is to be applied. The default
164 |             is 'int_at_borders' which is the intensity value at the
165 |             segment_borders.
166 |     from_measurement:
167 |         measured_baseline : ndarray, optional
168 |             The array contains known baseline data obtained through a
169 |             measurement valid for all datasets in raw_data. The length of the
170 |             array has to match the number M of data points in raw_data. The
171 |             default is a zero filled array, so this is only meaningful if
172 |             some baseline data is passed. Otherwise a zero baseline is
173 |             returned. The returned array has the same shape as raw_data and
174 |             can be subtracted from it directly. This is probably not so
175 |             terribly useful.
176 |     """
177 |     # Optionally, spectrum data is smoothed before baseline calculation. This
178 |     # makes sense especially for baseline generation methods that have problems
179 |     # with noise. Currently Savitzky-Golay only.
180 |     if smoothing:
181 |         savgol_window = kwargs.get('savgol_window', 9)
182 |         savgol_order = kwargs.get('savgol_order', 2)
183 |         raw_data = smooth_spectra(raw_data, 'sav_gol',
184 |                                   savgol_points=savgol_window,
185 |                                   poly_order=savgol_order)
186 | 
187 |     # Transformation makes sense for spectra that cover a broad range of peak
188 |     # intensities. Otherwise, small peaks may be more or less ignored during
189 |     # baseline calculation. Currently LLS transformation only.
190 |     if transform:
191 |         spectra_minimum_value = raw_data.min()
192 |         raw_data = transform_spectra(raw_data, 'log_log_sqrt')
193 | 
194 |     # wavenumbers are used for convex_hull, ModPoly, IModPoly, PPF, iALSS
195 |     if 'wavenumbers' in kwargs:
196 |         wavenumbers = kwargs.get('wavenumbers')
197 |         ascending_wn = (wavenumbers[1]-wavenumbers[0]) > 0
198 |     else:
199 |         wavenumbers = np.arange(raw_data.shape[1])
200 |         ascending_wn = True
201 | 
202 |     baseline_data = np.zeros_like(raw_data)
203 |     baseline_modes = ['convex_hull', 'ALSS', 'iALSS', 'drPLS', 'SNIP',
204 |                       'ModPoly', 'IModPoly', 'PPF', 'from_measurement']
205 | 
206 |     if mode == baseline_modes[0]:  # convex_hull
207 |         # based on (but improved a bit)
208 |         # https://dsp.stackexchange.com/questions/2725/
209 |         # how-to-perform-a-rubberband-correction-on-spectroscopic-data
210 | 
211 |         if ascending_wn:
212 |             raw_data = np.flip(raw_data, axis=1)
213 |             wavenumbers = np.flip(wavenumbers)
214 | 
215 |         for ii, current_spectrum in enumerate(tqdm(raw_data)):
216 |             hull_vertices = ConvexHull(
217 |                 np.array(list(zip(wavenumbers, current_spectrum)))).vertices
218 | 
219 |             # Rotate convex hull vertices until they start from the lowest one
220 |             hull_vertices = np.roll(hull_vertices, -np.argmin(hull_vertices))
221 | 
222 |             # split vertices into upper and lower part
223 |             hull_vertices_section_1 = hull_vertices[:np.argmax(hull_vertices)
224 |                                                     + 1]
225 |             hull_vertices_section_2 = np.sort(
226 |                 np.insert(hull_vertices[np.argmax(hull_vertices):], 0,
227 |                           hull_vertices[0]))
228 | 
229 |             # calculate spectrum mean intensities of upper and lower vertices
230 |             raw_mean_1 = np.mean(current_spectrum[hull_vertices_section_1])
231 |             raw_mean_2 = np.mean(current_spectrum[hull_vertices_section_2])
232 | 
233 |             # Select lower vertices as baseline vertices
234 |             if raw_mean_1 > raw_mean_2:
235 |                 baseline_vertices = hull_vertices_section_2
236 |             else:
237 |                 baseline_vertices = hull_vertices_section_1
238 | 
239 |             # Create baseline using linear interpolation between vertices
240 |             baseline_data[ii, :] = np.interp(
241 |                 wavenumbers, np.flip(wavenumbers[baseline_vertices]),
242 |                 np.flip(current_spectrum[baseline_vertices]))
243 | 
244 |         if ascending_wn:
245 |             baseline_data = np.flip(baseline_data, axis=1)
246 | 
247 |     elif mode == baseline_modes[1]:  # ALSS
248 |         # according to
249 |         # "Baseline Correction with Asymmetric Least Squares Smoothing"
250 |         # by P. Eilers and H. Boelens.
251 | # https://zanran_storage.s3.amazonaws.com/www.science.uva.nl/ 252 | # ContentPages/443199618.pdf 253 | 254 | # set mode specific parameters 255 | lam = kwargs.get('lam', 10000) 256 | p = kwargs.get('p', 0.001) 257 | n_iter = kwargs.get('n_iter', 10) 258 | conv_crit = kwargs.get('conv_crit', 0.001) 259 | ############################# 260 | 261 | L = raw_data.shape[1] 262 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 263 | D = D.dot(D.transpose()) 264 | 265 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 266 | 267 | # this is the code for the fitting procedure 268 | w = np.ones(L) 269 | W = diags(w, format='csr') 270 | z = w 271 | 272 | for jj in range(int(n_iter)): 273 | W.setdiag(w) 274 | Z = W + lam * D 275 | z_prev = z 276 | z = spsolve(Z, w*current_spectrum, permc_spec='NATURAL') 277 | if np.linalg.norm(z - z_prev) > conv_crit: 278 | w = p * (current_spectrum > z) + (1-p) * ( 279 | current_spectrum < z) 280 | else: 281 | break 282 | # end of fitting procedure 283 | 284 | baseline_data[ii, :] = z 285 | 286 | elif mode == baseline_modes[2]: # iALSS 287 | # according to "Anal. Methods, 2014, 6, 4402–4407." 288 | 289 | # set mode specific parameters 290 | lam = kwargs.get('lam', 2000) 291 | lam_1 = kwargs.get('lam_1', 0.01) 292 | p = kwargs.get('p', 0.01) 293 | n_iter = kwargs.get('n_iter', 10) 294 | conv_crit = kwargs.get('conv_crit', 0.001) 295 | ############################# 296 | 297 | L = raw_data.shape[1] 298 | fit_coeffs = np.polynomial.polynomial.polyfit(wavenumbers, 299 | raw_data.T, 2) 300 | w_start_all = np.polynomial.polynomial.polyval(wavenumbers, fit_coeffs) 301 | 302 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 303 | D = D.dot(D.transpose()) 304 | D_1 = diags([-1, 1], [0, -1], shape=(L, L-1), format='csr') 305 | D_1 = D_1.dot(D_1.transpose()) 306 | 307 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 308 | 309 | # this is the code for the fitting procedure 310 | w = w_start_all[ii, :] 311 | z = w 312 | W = diags(w, format='csr') 313 | w = p * (current_spectrum > z) + (1-p) * (current_spectrum < z) 314 | 315 | for jj in range(int(n_iter)): 316 | W.setdiag(w) 317 | W = W.dot(W.transpose()) 318 | Z = W + lam_1 * D_1 + lam * D 319 | R = (W + lam_1 * D_1) * current_spectrum 320 | z_prev = z 321 | z = spsolve(Z, R, permc_spec='NATURAL') 322 | if np.linalg.norm(z - z_prev) > conv_crit: 323 | w = p * (current_spectrum > z) + (1-p) * ( 324 | current_spectrum < z) 325 | else: 326 | break 327 | # end of fitting procedure 328 | 329 | baseline_data[ii, :] = z 330 | 331 | elif mode == baseline_modes[3]: # drPLS 332 | # according to "Applied Optics, 2019, 58, 3913-3920." 
333 | 334 | # set mode specific parameters 335 | lam = kwargs.get('lam', 1000000) 336 | eta = kwargs.get('eta', 0.5) 337 | n_iter = kwargs.get('n_iter', 100) 338 | conv_crit = kwargs.get('conv_crit', 0.001) 339 | ############################# 340 | 341 | L = raw_data.shape[1] 342 | 343 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 344 | D = D.dot(D.transpose()) 345 | D_1 = diags([-1, 1], [0, -1], shape=(L, L-1), format='csr') 346 | D_1 = D_1.dot(D_1.transpose()) 347 | 348 | w_0 = np.ones(L) 349 | I_n = diags(w_0, format='csr') 350 | 351 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 352 | 353 | # this is the code for the fitting procedure 354 | w = w_0 355 | W = diags(w, format='csr') 356 | Z = w_0 357 | 358 | for jj in range(int(n_iter)): 359 | W.setdiag(w) 360 | Z_prev = Z 361 | Z = spsolve(W + D_1 + lam * (I_n - eta*W) * 362 | D, W*current_spectrum, permc_spec='NATURAL') 363 | if np.linalg.norm(Z - Z_prev) > conv_crit: 364 | d = current_spectrum - Z 365 | d_negative = d[d < 0] 366 | sigma_negative = np.std(d_negative) 367 | mean_negative = np.mean(d_negative) 368 | w = 0.5 * (1 - np.exp(jj) * (d - ( 369 | -mean_negative + 2*sigma_negative))/sigma_negative / ( 370 | 1 + np.abs(np.exp(jj) * (d - ( 371 | - mean_negative + 2 * sigma_negative)) / 372 | sigma_negative))) 373 | else: 374 | break 375 | # end of fitting procedure 376 | 377 | baseline_data[ii, :] = Z 378 | 379 | elif mode == baseline_modes[4]: # SNIP 380 | # according to "Nuclear Instruments and Methods in Physics Research 381 | # 934 (1988) 396-402." 382 | # and Nuclear Instruments and Methods in Physics Research Section A: 383 | # Accelerators, Spectrometers, Detectors and Associated Equipment 1997, 384 | # 401 (1), 113-132 385 | 386 | # set mode specific parameters 387 | n_iter = kwargs.get('n_iter', 100) 388 | ############################# 389 | 390 | spectrum_points = raw_data.shape[1] 391 | working_spectra = np.zeros_like(raw_data) 392 | 393 | for pp in tqdm(np.arange(1, n_iter+1)): 394 | r1 = raw_data[:, pp:spectrum_points-pp] 395 | r2 = (np.roll(raw_data, -pp, axis=1)[:, pp:spectrum_points-pp] + 396 | np.roll(raw_data, pp, axis=1)[:, pp:spectrum_points-pp])/2 397 | working_spectra = np.minimum(r1, r2) 398 | raw_data[:, pp:spectrum_points-pp] = working_spectra 399 | 400 | baseline_data = raw_data 401 | 402 | elif mode in baseline_modes[5:8]: # ModPoly, IModPoly, PPF 403 | # according to Applied Spectroscopy, 2007, 61 (11), 1225-1232. 404 | # without dev: Chemometrics and Intelligent Laboratory Systems 82 405 | # (2006) 59– 65. 406 | # Maybe also ModPoly from first source? 
407 | 408 | # set mode specific parameters 409 | n_iter = kwargs.get('n_iter', 100) 410 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 411 | poly_order = kwargs.get('poly_order', 5) 412 | fixed_points = kwargs.get('fixed_points', None) 413 | fixed_slopes = kwargs.get('fixed_slopes', None) 414 | if mode == baseline_modes[7]: # PPF 415 | segment_borders = kwargs.get( 416 | 'segment_borders', [wavenumbers[len(wavenumbers)//2]]) 417 | 418 | poly_orders = kwargs.get('poly_orders', [3, 3]) 419 | y_at_borders = kwargs.get('y_at_borders', 'int_at_borders') 420 | fit_method = kwargs.get('fit_method', 'ModPoly') 421 | ############################# 422 | 423 | if not ascending_wn: 424 | raw_data = np.flip(raw_data, axis=1) 425 | wavenumbers = np.flip(wavenumbers) 426 | 427 | wavenumbers_start = wavenumbers 428 | # previous_dev = 0 429 | 430 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 431 | wavenumbers = wavenumbers_start 432 | 433 | if mode == baseline_modes[7]: # 'PPF' 434 | if y_at_borders == 'int_at_borders': 435 | y_at_borders_values = y_at_x( 436 | segment_borders, wavenumbers, current_spectrum) 437 | else: 438 | y_at_borders_values = y_at_borders 439 | 440 | for jj in range(int(n_iter)): 441 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 442 | # The polynomial_fit method from pyDataFitting is only used 443 | # if constraints are to be considered because the numpy 444 | # polyfit method is faster. 445 | if (fixed_points is not None) or ( 446 | fixed_slopes is not None): 447 | fit_data, fit_coeffs = polynomial_fit( 448 | wavenumbers, current_spectrum, poly_order, 449 | fixed_points=fixed_points, 450 | fixed_slopes=fixed_slopes) 451 | else: 452 | fit_coeffs = np.polynomial.polynomial.polyfit( 453 | wavenumbers, current_spectrum, poly_order) 454 | fit_data = np.polynomial.polynomial.polyval( 455 | wavenumbers, fit_coeffs) 456 | else: # PPF 457 | fit_data, fit_coeffs = piecewise_polynomial_fit( 458 | wavenumbers, current_spectrum, segment_borders, 459 | poly_orders, y_at_borders=y_at_borders_values, 460 | slope_at_borders=None) 461 | 462 | # ModPoly or PPF with ModPoly 463 | if (mode == baseline_modes[5]) or ( 464 | (mode == baseline_modes[7]) and (fit_method=='ModPoly') 465 | ): 466 | dev = 0 467 | # IModPoly or PPF with IModPoly 468 | else: 469 | residual = current_spectrum - fit_data 470 | dev = residual.std() 471 | # if abs((dev - previous_dev)/dev) < 0.01: 472 | # break 473 | 474 | if jj == 0: 475 | mask = (current_spectrum <= fit_data + dev) 476 | wavenumbers = wavenumbers[mask] 477 | current_spectrum = current_spectrum[mask] 478 | fit_data = fit_data[mask] 479 | np.copyto(current_spectrum, fit_data + dev, 480 | where=(current_spectrum >= (fit_data+dev))) 481 | # previous_dev = dev 482 | 483 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 484 | baseline_data[ii, :] = np.polynomial.polynomial.polyval( 485 | wavenumbers_start, fit_coeffs) 486 | else: # PPF 487 | baseline_data[ii, :] = piecewise_polynomial( 488 | wavenumbers_start, fit_coeffs, 489 | segment_borders=segment_borders) 490 | 491 | if not ascending_wn: 492 | baseline_data = np.flip(baseline_data, axis=1) 493 | # raw_data = np.flip(raw_data, axis=1) 494 | 495 | elif mode == baseline_modes[8]: # from_measurement 496 | spectrum_number = len(raw_data) 497 | spectrum_points = raw_data.shape[1] 498 | measured_baseline = kwargs.get( 499 | 'measured_baseline', np.zeros(spectrum_points)) 500 | if len(measured_baseline) != spectrum_points: 501 | raise ValueError( 502 | 'The given baseline data consists of {} 
data points, but {} ' 503 | 'were expected due to the shape of raw_data.'.format( 504 | len(measured_baseline), spectrum_points)) 505 | baseline_data = np.tile(measured_baseline, spectrum_number).reshape( 506 | spectrum_number, -1) 507 | 508 | else: 509 | raise ValueError('No valid baseline mode entered. Allowed modes are ' 510 | '{0}'.format(baseline_modes)) 511 | 512 | if transform: 513 | baseline_data = transform_spectra( 514 | baseline_data, 'log_log_sqrt', direction='inverse', 515 | min_value=spectra_minimum_value) 516 | 517 | return np.around(baseline_data, decimals=6) 518 | -------------------------------------------------------------------------------- /src/pyPreprocessing/smoothing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions for smoothing and filtering of data rows oganized in 2D 4 | numpy arrays. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from scipy.signal import savgol_filter 10 | from scipy.interpolate import interp1d 11 | from sklearn.decomposition import PCA 12 | 13 | 14 | def smoothing(raw_data, mode, interpolate=False, point_mirror=True, **kwargs): 15 | """ 16 | Smoothes data rows with different algorithms. 17 | 18 | Parameters 19 | ---------- 20 | raw_data : ndarray 21 | 2D numpy array with the shape (N,M) containing N data rows to be 22 | smoothed. Each data row is represented by row in numpy array and 23 | contains M values. If only one data row is present, raw_data has the 24 | shape (1,M). 25 | mode : str 26 | Algorithm used for smoothing. Allowed modes are 'sav_gol' for Savitzky- 27 | Golay, 'rolling_median' for a median filter, 'pca' for smoothing based 28 | on principal component analysis, 'weighted_moving_average' for a 29 | moving average that uses weights, so e.g. can decide if values in the 30 | window are used for or excluded from averaging. 31 | interpolate : boolean 32 | False if x coordinate is evenly spaced. True if x coordinate is not 33 | evenly spaced, then raw_data is interpolated to an evenly spaced 34 | x coordinate. Default is False 35 | point_mirror : boolean 36 | Dataset is point reflected at both end points before smoothing to 37 | reduce artifacts at the data edges. 38 | **kwargs for interpolate=True 39 | x_coordinate : ndarray 40 | 1D numpy array with shape (M,) used for interpolation. 41 | data_points : int, optional 42 | number of data points returned after interpolation. Default is one 43 | order of magnitude more than M. 44 | return_type : string, optional 45 | Defines if the interpolated dataset with a number of data_points 46 | is returned ('interp') or if the returned dataset has the same 47 | dimensions and x_coordinates like the original dataset ('orig'). 48 | Default is 'interp'. 49 | **kwargs for different smoothing modes 50 | sav_gol: 51 | deriv : int 52 | Derivative order to be calculated. Default is 0 (no 53 | derivative). 54 | savgol_points : int 55 | Number of point defining one side of the Savitzky-Golay window. 56 | Total window is 2*savgol_points+1. Default is 9. 57 | poly_order : int 58 | Polynomial order used for polynomial fitting of the Savitzky- 59 | Golay window. Default is 2. 60 | savgol_mode : str 61 | Must be ‘mirror’, ‘constant’, ‘nearest’, ‘wrap’ or ‘interp’. 62 | See documentation of scipy.signal.savgol_filter. 63 | rolling_median: 64 | window: int 65 | Data points included in rolling window used for median 66 | calculations. Default is 5. 
67 | pca: 68 | pca_components : int 69 | Number of principal components used to reconstruct the original 70 | data. Default is 5. 71 | weighted_moving_average: 72 | weights : list of float 73 | The number of entries decide the window length used for 74 | smoothing. A value > 0 means that the value is used with the 75 | specified weight, a value of 0 means the value is excluded, 76 | e.g. [1, 0, 1] is a window of size 3 in which the center point 77 | is exluded from the calculations. Default is [1, 1, 0, 1, 1]. 78 | 79 | Returns 80 | ------- 81 | ndarray or tuple of ndarrays 82 | 2D numpy array containing the smoothed data in the same shape as 83 | raw_data if interpolate is false. Else tuple containing interpolated 84 | x coordinates and 2D numpy array in the shape of 85 | (N,10**np.ceil(np.log10(len(x_coordinate)))). In case of mode is 86 | weighted_moving_average, the corresponding standard deviations are 87 | also calulated and a tuple with the smoothed data and the standard 88 | deviations is returned. 89 | 90 | """ 91 | # copy of raw_data for later restoration of data edges 92 | raw_old = pd.DataFrame(raw_data.copy()) 93 | # Preprocessing of input data for unevenly spaced x coordinate 94 | if interpolate: 95 | x_coordinate = kwargs.get('x_coordinate', np.linspace( 96 | 0, 1000, raw_data.shape[1])) 97 | data_points = kwargs.get('data_points', 98 | int(10**np.ceil(np.log10(len(x_coordinate))))) 99 | 100 | itp = interp1d(x_coordinate, raw_data, kind='linear') 101 | x_interpolated = np.linspace(x_coordinate[0], x_coordinate[-1], 102 | data_points) 103 | raw_data = itp(x_interpolated) 104 | 105 | # Optional extension of smoothed data by point mirrored raw data. 106 | if point_mirror: 107 | raw_data = np.concatenate( 108 | ((-np.flip(raw_data, axis=1)+2*raw_data[:, 0, np.newaxis])[:, :-1], 109 | raw_data, (-np.flip(raw_data, axis=1) + 110 | 2*raw_data[:, -1, np.newaxis])[:, 1:]), axis=1) 111 | #raw_data = np.concatenate((-np.squeeze(raw_data.T)[::-1]+2*np.squeeze(raw_data.T)[0],np.squeeze(raw_data.T),-np.squeeze(raw_data.T)[::-1]+2*np.squeeze(raw_data.T)[-1]))[np.newaxis] 112 | 113 | smoothing_modes = ['sav_gol', 'rolling_median', 'pca', 114 | 'weighted_moving_average'] 115 | 116 | if mode == smoothing_modes[0]: # sav_gol 117 | deriv = kwargs.get('deriv', 0) 118 | savgol_points = kwargs.get('savgol_points', 9) 119 | poly_order = kwargs.get('poly_order', 2) 120 | savgol_mode = kwargs.get('savgol_mode', 'nearest') 121 | 122 | smoothed_data = savgol_filter(raw_data, 1+2*savgol_points, poly_order, 123 | deriv=deriv, axis=1, mode=savgol_mode) 124 | 125 | elif mode == smoothing_modes[1]: # rolling_median 126 | window = kwargs.get('window', 5) 127 | # next line due to pandas rolling window, look for numpy solution 128 | raw_data = pd.DataFrame(raw_data) 129 | 130 | edge_value_count = int((window-1)/2) 131 | smoothed_data = raw_data.T.rolling( 132 | window, center=True).median().T.iloc[ 133 | :, edge_value_count:-edge_value_count] 134 | 135 | # On the data edges, the original data is used, so the edges are not 136 | # smoothed (only relevant if point_mirror is False). 
137 | smoothed_data = pd.concat( 138 | [raw_old.iloc[:, 0:edge_value_count], smoothed_data, 139 | raw_old.iloc[:, -1-edge_value_count:]], axis=1).values 140 | 141 | elif mode == smoothing_modes[2]: # pca 142 | pca_components = kwargs.get('pca_components', 5) 143 | 144 | pca = PCA(n_components=pca_components) 145 | scores = pca.fit_transform(raw_data) 146 | loadings = pca.components_ 147 | 148 | smoothed_data = ( 149 | np.dot(scores, loadings) + np.mean(raw_data, axis=0)) 150 | 151 | elif mode == smoothing_modes[3]: # weighted_moving_average 152 | weights = kwargs.get('weights', [1, 1, 0, 1, 1]) 153 | 154 | window_size = len(weights) 155 | value_count = raw_data.shape[1] 156 | edge_value_count = int((window_size-1)/2) 157 | remaining_values = value_count-window_size+1 158 | 159 | column_indices = np.repeat( 160 | np.arange(window_size)[np.newaxis], remaining_values, axis=0 161 | ) + np.arange(remaining_values)[:, np.newaxis] 162 | # column_indices = column_indices[:, weights] 163 | 164 | # the following step multiplies the total value number with 165 | # window_size, so might be problematic for large datasets 166 | value_array = np.squeeze(raw_data[np.newaxis][:, :, column_indices]) 167 | if len(value_array.shape) == 2: 168 | value_array = value_array[np.newaxis] 169 | smoothed_data, selective_std = weighted_mean_std(value_array, weights) 170 | smoothed_data = pd.DataFrame(smoothed_data) 171 | 172 | # selective_std = np.std(value_array, axis=2) 173 | # On the edges, the std is calculated from the reduced number of edge 174 | # data points (only relevant if point_mirror is False). 175 | selective_std = np.concatenate(( 176 | np.repeat(np.std(raw_old.values[:, 0:edge_value_count], axis=1), 177 | edge_value_count).reshape(-1, edge_value_count), 178 | selective_std, 179 | np.repeat(np.std(raw_old.values[:, -edge_value_count:], axis=1), 180 | edge_value_count).reshape(-1, edge_value_count) 181 | ), axis=1) 182 | 183 | # On the data edges, the original data is used, so the edges are not 184 | # smoothed (only relevant if point_mirror is False). 185 | raw_data = pd.DataFrame(raw_data) 186 | smoothed_data = pd.concat( 187 | [raw_old.iloc[:, 0:edge_value_count], smoothed_data, 188 | raw_old.iloc[:, -edge_value_count:]], axis=1).values 189 | 190 | else: 191 | raise ValueError('No valid smoothing mode entered. Allowed modes are ' 192 | '{0}'.format(smoothing_modes)) 193 | 194 | # Removal of previously added point mirrored data. 195 | if point_mirror: 196 | smoothed_data = smoothed_data[ 197 | :, int(np.ceil(smoothed_data.shape[1]/3)-1): 198 | int(2*np.ceil(smoothed_data.shape[1]/3)-1)] 199 | if mode == smoothing_modes[3]: # weighted_moving_average 200 | selective_std = selective_std[ 201 | :, int(np.ceil(selective_std.shape[1]/3)-1): 202 | int(2*np.ceil(selective_std.shape[1]/3)-1)] 203 | 204 | if interpolate: 205 | return_type = kwargs.get('return_type', 'interp') 206 | if return_type == 'interp': 207 | return (x_interpolated, smoothed_data) 208 | elif return_type == 'orig': 209 | f = interp1d(x_interpolated, smoothed_data, kind='linear') 210 | return (x_coordinate, f(x_coordinate)) 211 | else: 212 | raise ValueError('No valid return_type given.') 213 | elif mode == smoothing_modes[3]: # weighted_moving_average 214 | return (smoothed_data, selective_std) 215 | else: 216 | return smoothed_data 217 | 218 | 219 | def weighted_mean_std(values, weights, std=True): 220 | """ 221 | Calculate the weighted mean and (biased) standard deviation of values. 
222 | 223 | Parameters 224 | ---------- 225 | values : ndarray 226 | An n-dimensional array in the shape (..., M) with data rows with M 227 | elements. Calculations are performed for each data row in the last 228 | dimension of values. 229 | weights : list of float 230 | A list containing the weights used in the calculations. Must contain 231 | M elements. 232 | std : bool, optional 233 | Decides if the weighted standard deviation is also calculated, default 234 | is True. 235 | 236 | Returns 237 | ------- 238 | weighted_mean : ndarray 239 | An (n-1)-dimensional array containing the weighted means for the data 240 | rows, so has the shape of values without the last dimension. 241 | weighted_std : ndarray 242 | An (n-1)-dimensional array containing the weighted standard deviations 243 | for the data rows, so has the shape of values without the last 244 | dimension. Only in case of std=True. 245 | 246 | """ 247 | weighted_mean = np.average(values, weights=weights, axis=-1) 248 | if std: 249 | weighted_std = np.sqrt( 250 | np.average((values-weighted_mean[..., np.newaxis])**2, 251 | weights=weights, axis=-1)) 252 | return (weighted_mean, weighted_std) 253 | else: 254 | return weighted_mean 255 | 256 | 257 | def filtering(raw_data, mode, fill='NaN', **kwargs): 258 | """ 259 | Filter data rows with different algorithms. 260 | 261 | Filtered values are replaced by np.nan. 262 | 263 | Parameters 264 | ---------- 265 | raw_data : ndarray 266 | 2D numpy array with the shape (N,M) containing N data rows to be 267 | filtered. Each data row is represented by row in numpy array and 268 | contains M values. If only one data row is present, raw_data has the 269 | shape (1, M). 270 | mode : str 271 | Algorithm used for filtering. Allowed modes are 'spike_filter' for 272 | sharp peaks, 'max_thresh' for removal of values above or equal to a 273 | maximum threshold, 'min_thresh' for removal of values below or equal to 274 | a minumum threshold. 275 | fill : str, optional 276 | Decides the way filtered points are replaced. Currently 'NaN' 277 | where values are replaced by np.nan, 'zeros' where values are 278 | replaced by zeros, or 'mov_avg' (only for mode=='spike_filter') where 279 | values are replaced by the weighted moving average. 280 | **kwargs for different filter modes 281 | spike_filter: 282 | weights : list of float, optional 283 | The number of entries decide the window length used for 284 | smoothing. A value > 0 means that the value is used with the 285 | specified weight, a value of 0 means the value is excluded, 286 | e.g. [1, 0, 1] is a window of size 3 in which the center point 287 | is exluded from the calculations. Default is [1, 1, 0, 1, 1]. 288 | std_factor : float, optional 289 | The number of standard deviations a value is allowed to be away 290 | from the moving average before it is removed by the filter. 291 | Mean and standard deviation are calculated in a rolling fashion 292 | so that only sharp peaks are found. Default is 2. 293 | point_mirror : bool, optional 294 | Decides if the data edges are point mirrored before rolling 295 | average. If True, estimates of mean and standard deviation also 296 | at the edges are obtained. If False, data at the edges are kept 297 | like in the original. Default is False. 298 | interpolate : boolean, optional 299 | False if x coordinate is evenly spaced. True if x coordinate is 300 | not evenly spaced, then raw_data is interpolated to an evenly 301 | spaced x coordinate. 
Default is False 302 | max_thresh 303 | max:_thresh : float, optional 304 | The maximum threshold. Default is 1000. 305 | min_thresh 306 | min_thresh : float, optional 307 | The minimum threshold. Default is 0. 308 | 309 | Returns 310 | ------- 311 | ndarray 312 | Returns an ndarray with dimensions like raw_data. Filtered points are 313 | changed according to the fill selected. 314 | 315 | """ 316 | filter_modes = ['spike_filter', 'max_thresh', 'min_thresh'] 317 | fill_values = ['NaN', 'zeros', 'mov_avg'] 318 | if fill == 'NaN': 319 | fill_value = np.nan 320 | elif fill == 'zeros': 321 | fill_value = 0 322 | elif ((fill not in fill_values) or 323 | (fill == 'mov_avg' and mode != filter_modes[0])): 324 | raise ValueError('No valid fill value given for this mode.') 325 | 326 | if mode == filter_modes[0]: # spike_filter 327 | weights = kwargs.get('weights', [1, 1, 0, 1, 1]) 328 | window_size = len(weights) 329 | std_factor = kwargs.get('std_factor', 2) 330 | point_mirror = kwargs.get('point_mirror', False) 331 | interpolate = kwargs.get('interpolate', False) 332 | 333 | filtered_data = raw_data.copy() 334 | mov_avg, mov_std = smoothing( 335 | filtered_data, 'weighted_moving_average', 336 | point_mirror=point_mirror, interpolate=interpolate, 337 | weights=weights) 338 | 339 | diffs = np.absolute(filtered_data - mov_avg) 340 | 341 | if fill == 'mov_avg': 342 | fill_value = mov_avg[diffs > std_factor*mov_std] 343 | filtered_data[diffs > std_factor*mov_std] = fill_value 344 | # filtered_data = raw_data 345 | 346 | elif mode == filter_modes[1]: # max_thresh 347 | maximum_threshold = kwargs.get('max_thresh', 1000) 348 | filtered_data = raw_data.copy().astype(float) 349 | filtered_data[filtered_data > maximum_threshold] = fill_value 350 | # filtered_data = raw_data 351 | 352 | elif mode == filter_modes[2]: # min_thresh 353 | minimum_threshold = kwargs.get('min_thresh', 0) 354 | filtered_data = raw_data.copy().astype(float) 355 | filtered_data[filtered_data < minimum_threshold] = fill_value 356 | # filtered_data = raw_data 357 | 358 | else: 359 | raise ValueError('No valid filter mode entered. Allowed modes are ' 360 | '{0}'.format(filter_modes)) 361 | 362 | return filtered_data 363 | -------------------------------------------------------------------------------- /src/pyPreprocessing/transform.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions for data transformation (currently only LLS) and 4 | normalization. 5 | """ 6 | 7 | import numpy as np 8 | 9 | from little_helpers.array_tools import closest_index 10 | 11 | 12 | def transform(raw_data, mode, direction='direct', **kwargs): 13 | """ 14 | Apply mathematical transformations to data. 15 | 16 | Parameters 17 | ---------- 18 | raw_data : ndarray 19 | 2D numpy array with the shape (N, M) containing N data rows to be 20 | smoothed. Each data row is represented by row in numpy array and 21 | contains M values. If only one data row is present, raw_data has the 22 | shape (1, M). 23 | mode : str 24 | Maths used for transformation. Allowed mode is 'log_log_sqrt' only at 25 | the moment which first takes the square root and then does the 26 | logarithm twice. 27 | direction : str, optional 28 | Gives the direction of the tranformation. If 'direct', the data is 29 | transformed, if 'inverse', the inverse of the transformation is 30 | calculated. The default is 'direct'. 
31 | **kwargs for the different modes 32 | mode is 'log_log_sqrt' and direction is 'inverse': 33 | min_value : float 34 | Original minimum value of the data before transformation. Has 35 | to be known because it is lost upon transformation. Default is 36 | 1. 37 | 38 | Raises 39 | ------ 40 | ValueError 41 | If the value passed as mode or direction is not understood. 42 | 43 | Returns 44 | ------- 45 | raw_data : ndarray 46 | Transformed data with the same shape as raw_data. 47 | 48 | """ 49 | # list of allowed modes for data transformation 50 | transform_modes = ['log_log_sqrt'] 51 | 52 | if direction == 'direct': 53 | if mode == transform_modes[0]: 54 | minimum_value = np.min(raw_data) 55 | raw_data -= minimum_value 56 | raw_data = np.log(np.log(np.sqrt(raw_data + 1) + 1) + 1) 57 | else: 58 | raise ValueError('No valid transform mode entered. Allowed modes ' 59 | 'are {0}'.format(transform_modes)) 60 | 61 | elif direction == 'inverse': 62 | if mode == transform_modes[0]: 63 | minimum_value = kwargs.get('min_value', 1) 64 | raw_data = (np.exp(np.exp(raw_data) - 1) - 1)**2 - 1 65 | raw_data += minimum_value 66 | else: 67 | raise ValueError('No valid transform mode entered. Allowed modes ' 68 | 'are {0}'.format(transform_modes)) 69 | else: 70 | raise ValueError('No valid transform direction entered. Allowed ' 71 | 'directions are [\'direct\', \'inverse\']') 72 | 73 | return raw_data 74 | 75 | 76 | def normalize(raw_data, mode, factor=1, **kwargs): 77 | ''' 78 | Normalize data such as spectra to a certain value. 79 | 80 | 81 | Parameters 82 | ---------- 83 | raw_data : ndarray 84 | 2D numpy array with the shape (N, M) containing N data rows to be 85 | normalized. Each data row is represented by row in numpy array and 86 | contains M values. If only one data row is present, raw_data has the 87 | shape (1, M). 88 | mode : string 89 | The mode of data normalization. Allowed modes are 'total_intensity' 90 | (total integral under the data is set to a specific value), 'integral' 91 | (integral under parts of the data is set to a specific value), or 92 | 'max_intensity' (data is divided by maximum intensity). 93 | factor : float, optional 94 | The value the normalized parameter has after the operation. The default 95 | is 1. 96 | **kwargs for the different modes 97 | mode is 'total_intensity' or 'integral': 98 | x_data : ndarray or list 99 | A 1D numpy array or list containing the x data (such as 100 | wavenumbers) corresponding to raw_data. Should be sorted in an 101 | ascending order. 102 | mode is 'integral': 103 | limits : list 104 | A list of two numbers giving the values in x_data which define 105 | the limits of the integration. If this is not given, the mode 106 | 'integral' behaves identical to 'total_intensity'. 107 | 108 | Returns 109 | ------- 110 | normalized_data : ndarray 111 | Normalized data with the same shape as raw_data. 
112 | 113 | ''' 114 | raw_data = np.asarray(raw_data) 115 | 116 | # list of allowed modes for normalization 117 | normalize_modes = ['total_intensity', 'integral', 'max_intensity'] 118 | 119 | if mode in normalize_modes[0:2]: # 'total_intensity', 'integral' 120 | if 'x_data' in kwargs: 121 | x_data = np.asarray(kwargs.get('x_data')) 122 | else: 123 | raise TypeError( 124 | 'For mode \'total_intensity\' or \'integral\', x_data must be ' 125 | 'provided.') 126 | 127 | if 'limits' in kwargs: 128 | limits = kwargs.get('limits') 129 | limit_idx = closest_index(limits, x_data) 130 | else: 131 | limit_idx = [0, len(x_data)-1] 132 | 133 | integral = np.trapezoid( 134 | raw_data[:, limit_idx[0]:limit_idx[1]+1], 135 | x=x_data[limit_idx[0]:limit_idx[1]+1], axis=1)[:, np.newaxis] 136 | 137 | conversion_factor = 1/integral 138 | 139 | elif mode == normalize_modes[2]: # 'max_intensity' 140 | conversion_factor = 1/raw_data.max(axis=1)[:, np.newaxis] 141 | 142 | else: 143 | raise ValueError('No valid normalization mode entered. Allowed modes ' 144 | 'are {0}'.format(normalize_modes)) 145 | 146 | normalized_data = raw_data * conversion_factor * factor 147 | return normalized_data 148 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/test_baseline_correction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import unittest 12 | 13 | from src.pyPreprocessing import baseline_correction 14 | 15 | 16 | class TestBaselineCorrection(unittest.TestCase): 17 | 18 | def test_baseline_correction(self): 19 | 20 | # Calculate a simple spectrum 21 | centers = np.array([1200, 1600]) 22 | amps = np.array([200, 100]) 23 | widths = np.array([30, 500]) 24 | noise_factor = 5 25 | 26 | wavenumbers = np.linspace(1000, 2000, 1001) 27 | intensities = ( 28 | (amps*np.exp(-(wavenumbers[:, None]-centers)**2/widths)).sum(axis=1) + 29 | noise_factor * np.random.normal(size=wavenumbers.size)) 30 | background = 0.0001* wavenumbers**2 31 | spectrum = intensities + background 32 | 33 | # Calculate baselines with different methods 34 | baseline_snip = baseline_correction.generate_baseline( 35 | spectrum[None], 'SNIP') 36 | baseline_convhull = baseline_correction.generate_baseline( 37 | spectrum[None], 'convex_hull', wavenumbers=wavenumbers) 38 | baseline_alss = baseline_correction.generate_baseline( 39 | spectrum[None], 'ALSS') 40 | baseline_ialss = baseline_correction.generate_baseline( 41 | spectrum[None], 'iALSS', wavenumbers=wavenumbers) 42 | baseline_drpls = baseline_correction.generate_baseline( 43 | spectrum[None], 'drPLS') 44 | baseline_modpoly = baseline_correction.generate_baseline( 45 | spectrum[None], 'ModPoly', wavenumbers=wavenumbers) 46 | baseline_imodpoly = baseline_correction.generate_baseline( 47 | spectrum[None], 'IModPoly', wavenumbers=wavenumbers) 48 | baseline_ppf = baseline_correction.generate_baseline( 49 | spectrum[None], 'PPF', wavenumbers=wavenumbers) 50 | 51 | spectrum_corrected = baseline_correction.correct_baseline( 52 | spectrum[None], 'SNIP') 53 | 54 | self.assertTrue(np.all(spectrum_corrected == spectrum-baseline_snip)) 55 | 56 | # test 
with transformation 57 | baseline_transform = baseline_correction.generate_baseline( 58 | spectrum[None], 'SNIP', transform=True) 59 | 60 | # test with flipped spectrum 61 | baseline_snip_desc = baseline_correction.generate_baseline( 62 | spectrum[::-1][None], 'SNIP') 63 | self.assertTrue(np.all(baseline_snip_desc[0, ::-1] == baseline_snip)) 64 | 65 | # test with descending wavenumbers 66 | baseline_modpoly_desc = baseline_correction.generate_baseline( 67 | spectrum[::-1][None], 'ModPoly', wavenumbers=wavenumbers[::-1]) 68 | self.assertTrue( 69 | np.all(baseline_modpoly_desc[0, ::-1] == baseline_modpoly)) 70 | 71 | # PPF with fixed border values 72 | baseline_ppf_fixed = baseline_correction.generate_baseline( 73 | spectrum[None], 'PPF', wavenumbers=wavenumbers, y_at_borders=[250], 74 | segment_borders=[1400.0]) 75 | wn_idx = np.argmax(wavenumbers==1400) 76 | self.assertEqual(baseline_ppf_fixed[0, wn_idx], 250) 77 | 78 | # ModPoly with fixed points 79 | baseline_modpoly_fixed = baseline_correction.generate_baseline( 80 | spectrum[None], 'ModPoly', wavenumbers=wavenumbers, 81 | fixed_points=[[1400, 250]]) 82 | self.assertEqual(baseline_modpoly_fixed[0, wn_idx], 250) 83 | 84 | # test error messages 85 | self.assertRaises(ValueError, baseline_correction.generate_baseline, 86 | spectrum[None], 'ModPoy') 87 | 88 | plt.plot(wavenumbers, spectrum) 89 | plt.plot(wavenumbers, baseline_snip.T, label='SNIP') 90 | plt.plot(wavenumbers, baseline_convhull.T, label='convex hull') 91 | plt.plot(wavenumbers, baseline_alss.T, label='ALSS') 92 | plt.plot(wavenumbers, baseline_ialss.T, label='iALSS') 93 | plt.plot(wavenumbers, baseline_drpls.T, label='drPLS') 94 | plt.plot(wavenumbers, baseline_modpoly.T, label='ModPoly') 95 | plt.plot(wavenumbers, baseline_imodpoly.T, label='IModPoly') 96 | plt.plot(wavenumbers, baseline_ppf.T, label='PPF') 97 | plt.legend() 98 | -------------------------------------------------------------------------------- /tests/test_smoothing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import unittest 12 | 13 | from src.pyPreprocessing import smoothing 14 | 15 | 16 | class TestSmoothing(unittest.TestCase): 17 | 18 | def test_smoothing(self): 19 | x = np.linspace(0, 10, 1100) 20 | noise = np.random.normal(size=(50, len(x))) 21 | 22 | x_interp, noise_savgol = smoothing.smoothing( 23 | noise, 'sav_gol', interpolate=True, 24 | x_coordinate=x, return_type='interp', savgol_points=10, 25 | window=15, data_points=1200, point_mirror=True) 26 | self.assertEqual(len(x_interp), len(noise_savgol.T)) 27 | self.assertEqual(len(x_interp), 1200) 28 | self.assertTrue(noise.std() > noise_savgol.std()) 29 | 30 | x_interp_2, noise_savgol_2 = smoothing.smoothing( 31 | noise, 'sav_gol', interpolate=True, 32 | x_coordinate=x, return_type='orig', savgol_points=10, 33 | window=15, data_points=1200, point_mirror=True) 34 | self.assertEqual(len(x_interp_2), len(noise_savgol_2.T)) 35 | self.assertEqual(len(x_interp_2), 1100) 36 | self.assertTrue(noise.std() > noise_savgol_2.std()) 37 | 38 | noise_rollingmedian = smoothing.smoothing( 39 | noise, 'rolling_median', window=10) 40 | self.assertTrue(noise.std() > noise_rollingmedian.std()) 41 | 42 | noise_pca = smoothing.smoothing(noise, 'pca', pca_components=2) 43 | self.assertTrue(noise.std() > noise_pca.std()) 44 | 45 
| noise_weightedaverage, _ = smoothing.smoothing( 46 | noise, 'weighted_moving_average') 47 | self.assertTrue(noise.std() > noise_weightedaverage.std()) 48 | 49 | # test with only one dataset 50 | noise_rollingmedian_single = smoothing.smoothing( 51 | noise[[0]], 'rolling_median', window=10) 52 | 53 | # test errors 54 | self.assertRaises(ValueError, smoothing.smoothing, noise[[0]], 55 | 'roling_median') 56 | self.assertRaises(ValueError, smoothing.smoothing, noise, 'sav_gol', 57 | interpolate=True, x_coordinate=x, return_type='irp', 58 | savgol_points=10, window=15, data_points=1100, 59 | point_mirror=True) 60 | 61 | def test_filtering(self): 62 | x = np.linspace(0, 10, 1100) 63 | noise = np.random.normal(size=(50, len(x))) 64 | noise[:, 500] = 300 65 | noise[:, 700] = -300 66 | 67 | # test spike filter 68 | noise_spike = smoothing.filtering(noise, 'spike_filter') 69 | self.assertTrue(np.all(np.isnan(noise_spike[:, 500]))) 70 | self.assertTrue(np.all(np.isnan(noise_spike[:, 700]))) 71 | 72 | noise_spike_2 = smoothing.filtering( 73 | noise, 'spike_filter', fill='mov_avg', 74 | weights=[1, 0.2, 1, 0, 0.5, 1, 1]) 75 | check_avg = (noise[:, 497] + 0.2 * noise[:, 498] + noise[:, 499] + 76 | 0.5 * noise[:, 501] + noise[:, 502] + noise[:, 503])/4.7 77 | self.assertTrue(np.all(noise_spike_2[:, 500]==check_avg)) 78 | 79 | noise_spike_3 = smoothing.filtering( 80 | noise, 'spike_filter', fill='zeros') 81 | self.assertTrue(np.all(noise_spike_3[:, 500] == 0)) 82 | self.assertTrue(np.all(noise_spike_3[:, 700] == 0)) 83 | 84 | # test maximum threshold 85 | noise_maxthresh = smoothing.filtering(noise, 'max_thresh', 86 | max_thresh=299) 87 | self.assertTrue(np.all(np.isnan(noise_maxthresh[:, 500]))) 88 | self.assertFalse(np.any(np.isnan(noise_maxthresh[:, 700]))) 89 | 90 | noise_maxthresh_2 = smoothing.filtering(noise, 'max_thresh', 91 | max_thresh=301) 92 | self.assertFalse(np.any(np.isnan(noise_maxthresh_2[:, 500]))) 93 | self.assertFalse(np.any(np.isnan(noise_maxthresh_2[:, 700]))) 94 | 95 | # test minimum threshold 96 | noise_minthresh = smoothing.filtering(noise, 'min_thresh', 97 | min_thresh=-299) 98 | self.assertFalse(np.any(np.isnan(noise_minthresh[:, 500]))) 99 | self.assertTrue(np.all(np.isnan(noise_minthresh[:, 700]))) 100 | 101 | noise_minthresh_2 = smoothing.filtering(noise, 'min_thresh', 102 | min_thresh=-301) 103 | self.assertFalse(np.any(np.isnan(noise_minthresh_2[:, 500]))) 104 | self.assertFalse(np.any(np.isnan(noise_minthresh_2[:, 700]))) 105 | 106 | # test errors 107 | self.assertRaises(ValueError, smoothing.filtering, noise, 108 | 'max_thresh', fill='mov_avg') 109 | self.assertRaises(ValueError, smoothing.filtering, noise, 110 | 'max_thresh', fill='zero') 111 | self.assertRaises(ValueError, smoothing.filtering, noise, 112 | 'spike_fil') -------------------------------------------------------------------------------- /tests/test_transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import unittest 11 | 12 | from src.pyPreprocessing import transform 13 | 14 | 15 | class TestTransform(unittest.TestCase): 16 | 17 | def test_transform(self): 18 | x = np.linspace(0, 10, 1100) 19 | y = x**2 -30 20 | 21 | # test lls transformation 22 | y_lls = transform.transform([y], 'log_log_sqrt', direction='direct') 23 | y_lls_inv = transform.transform( 24 | y_lls, 'log_log_sqrt', 
direction='inverse', min_value=y.min()) 25 | self.assertTrue(np.allclose(y, y_lls_inv[0])) 26 | 27 | # test errors 28 | self.assertRaises( 29 | ValueError, transform.transform, [y], 'log_log_sq', 30 | direction='direct') 31 | self.assertRaises( 32 | ValueError, transform.transform, [y], 'log_log_sq', 33 | direction='inverse') 34 | self.assertRaises( 35 | ValueError, transform.transform, [y], 'log_log_sqrt', 36 | direction='dir') 37 | 38 | def test_normalize(self): 39 | x = np.linspace(0, 10, 1100) 40 | y = x**2 -30 41 | 42 | y_norm = transform.normalize([y], 'total_intensity', x_data=x) 43 | self.assertAlmostEqual(np.trapezoid(y_norm, x=x, axis=1)[0], 1) 44 | 45 | y_norm_2 = transform.normalize([y], 'total_intensity', x_data=x, 46 | factor=3.25) 47 | self.assertAlmostEqual(np.trapezoid(y_norm_2, x=x, axis=1)[0], 3.25) 48 | 49 | # test errors 50 | self.assertRaises(ValueError, transform.normalize, [y], 'tot_int') 51 | --------------------------------------------------------------------------------
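
The following is an illustrative usage sketch, not a file of this repository. It shows how the baseline correction and smoothing functions documented above fit together, and it assumes pyPreprocessing is installed together with the dependencies listed in requirements.txt (including pyDataFitting and little_helpers). The simulated spectrum and all parameter values are made up for demonstration and mirror the calls used in examples/baseline_correction_examples.py and tests/test_baseline_correction.py.

import numpy as np

from pyPreprocessing.baseline_correction import (correct_baseline,
                                                 generate_baseline)
from pyPreprocessing.smoothing import smoothing

# Simulated noisy spectrum with a quadratic background. The functions expect
# 2D input, so a single spectrum is passed with shape (1, M).
wavenumbers = np.linspace(1000, 2000, 1001)
intensities = (200 * np.exp(-(wavenumbers - 1200)**2 / 30) +
               5 * np.random.normal(size=wavenumbers.size) +
               0.0001 * wavenumbers**2)
spectrum = intensities[np.newaxis]

# Estimate the background with the IModPoly algorithm (for inspection) and
# subtract it from the raw data in one step with correct_baseline.
baseline = generate_baseline(spectrum, 'IModPoly', wavenumbers=wavenumbers,
                             poly_order=3)
corrected = correct_baseline(spectrum, 'IModPoly', wavenumbers=wavenumbers,
                             poly_order=3)

# Savitzky-Golay smoothing of the baseline-corrected spectrum.
smoothed = smoothing(corrected, 'sav_gol', savgol_points=10, poly_order=2)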
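
A second sketch, likewise illustrative and based on the calls in tests/test_smoothing.py and tests/test_transform.py, shows spike removal with the filtering function followed by normalization; the input data and the spike position are assumptions chosen only for demonstration.

import numpy as np

from pyPreprocessing import transform
from pyPreprocessing.smoothing import filtering

# One data row with an artificial spike at index 500, shape (1, M).
x = np.linspace(0, 10, 1100)
data = (x**2)[np.newaxis].astype(float)
data[0, 500] = 300

# Remove sharp spikes; filtered points are replaced by the weighted moving
# average of their neighborhood.
despiked = filtering(data, 'spike_filter', fill='mov_avg')

# Normalize so that the integral under each data row equals 1.
normalized = transform.normalize(despiked, 'total_intensity', x_data=x)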