├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── examples └── baseline_correction_examples.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── pyPreprocessing │ ├── __init__.py │ ├── baseline_correction.py │ ├── smoothing.py │ └── transform.py └── tests ├── __init__.py ├── test_baseline_correction.py ├── test_smoothing.py └── test_transform.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | pytest --cov=./ --cov-report=xml 41 | - name: Upload coverage to Codecov 42 | uses: codecov/codecov-action@v5 43 | with: 44 | token: ${{ secrets.CODECOV_TOKEN }} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | to_do.txt 3 | .pylint.d 4 | /dist/ 5 | /src/*.egg-info -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Alexander Southan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
2 | [![build workflow](https://github.com/AlexanderSouthan/pyPreprocessing/actions/workflows/python-package.yml/badge.svg)](https://github.com/AlexanderSouthan/pyPreprocessing/actions/workflows/python-package.yml)
3 | [![codecov](https://codecov.io/gh/AlexanderSouthan/pyPreprocessing/branch/master/graph/badge.svg?token=7GN1K2MVJ3)](https://codecov.io/gh/AlexanderSouthan/pyPreprocessing)
4 | 
5 | # pyPreprocessing
6 | ## General information
7 | pyPreprocessing is a package for preprocessing datasets such as Raman spectra,
8 | infrared spectra, UV/Vis spectra, but also HPLC data and many other types of
9 | data, currently via baseline correction, smoothing, filtering, transformation,
10 | normalization and derivatives. It relies on numpy, pandas, scipy, tqdm and
11 | scikit-learn, but also on https://github.com/AlexanderSouthan/pyDataFitting
12 | for the introduction of equality constraints into the polynomial baseline
13 | estimation methods, and on https://github.com/AlexanderSouthan/little_helpers.
14 | 
15 | ## Documentation
16 | Please visit:
17 | https://alexandersouthan.github.io/pyPreprocessing/
18 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: pyPreprocessing
2 | dependencies:
3 |   - numpy
4 |   - pandas
5 |   - matplotlib
6 |   - scikit-learn
7 |   - scipy
8 |   - tqdm
9 |   - pip
10 |   - pip:
11 |     - pyDataFitting
12 |     - little_helpers
13 | 
--------------------------------------------------------------------------------
/examples/baseline_correction_examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | 
8 | from pyPreprocessing.baseline_correction import generate_baseline
9 | from pyPreprocessing.smoothing import smoothing
10 | from little_helpers.math_functions import gaussian
11 | from little_helpers.num_derive import derivative
12 | 
13 | 
14 | def simulate_spectrum(peak_centers, peak_amplitudes, peak_widths,
15 |                       baseline_type='polynomial', baseline_parameters=[1],
16 |                       noise_level=1, wn_start=0, wn_end=1000,
17 |                       data_points=1000):
18 |     """
19 |     Calculate a spectrum with Gaussian peaks.
20 | 
21 |     Parameters
22 |     ----------
23 |     peak_centers : list of float
24 |         The peak centers.
25 |     peak_amplitudes : list of float
26 |         The peak amplitudes, i.e. the maximum value of the peak.
27 |     peak_widths : list of float
28 |         The sigma of the Gaussian peaks.
29 |     baseline_type : str, optional
30 |         The baseline type, currently only 'polynomial' using the calc_function
31 |         polynomial calculation. The default is 'polynomial'.
32 |     baseline_parameters : list of float, optional
33 |         Parameters passed to calc_function. The default is [1], resulting in a
34 |         constant baseline with a value of 1 in case baseline_type is
35 |         'polynomial'.
36 | noise_level : float, optional 37 | The maximum level of the noise. The default is 1. 38 | wn_start : float, optional 39 | The start wavenumber used for spectrum calculation. The default is 0. 40 | wn_end : float, optional 41 | The end wavenumber used for spectrum calculation. The default is 1000. 42 | data_points : int, optional 43 | The number of evenly spaced data points between wn_start and wn_end 44 | used for spectrum calculation. The default is 1000. 45 | 46 | Returns 47 | ------- 48 | ndarray 49 | 2D array with the wavenumbers and the intensities. 50 | 51 | """ 52 | # Calculate wavennumbers 53 | wavenumbers = np.linspace(wn_start, wn_end, num=data_points) 54 | 55 | # Pass Gaussian paramters to gaussian for pure 56 | # spectrum intensities without noise and baseline contributions 57 | pure_intensities = gaussian(wavenumbers, peak_amplitudes, peak_centers, 58 | np.zeros_like(peak_amplitudes), peak_widths) 59 | 60 | # Calculate noise as random Gaussian noise 61 | rng = np.random.default_rng() 62 | noise = rng.standard_normal(len(pure_intensities)) * noise_level 63 | 64 | # Calculate baseline 65 | if baseline_type == 'polynomial': 66 | baseline = np.polynomial.polynomial.polyval(wavenumbers, baseline_parameters) 67 | else: 68 | baseline = np.zeros_like(pure_intensities) 69 | 70 | # Calculate spectrum intensities as the sum of pure intensities, noise and 71 | # baseline contribution 72 | intensities = pure_intensities + noise + baseline 73 | 74 | return np.array([wavenumbers, intensities]) 75 | 76 | 77 | spectrum = simulate_spectrum([200, 250, 500], [10, 5, 20], [10, 40, 5], 78 | baseline_parameters=[5, 0.01, 0.0003], noise_level=1) 79 | spectrum_clean = simulate_spectrum([200, 250, 500], [10, 5, 20], [10, 40, 5], 80 | baseline_parameters=[0], noise_level=0) 81 | 82 | smoothed_spectrum = smoothing(spectrum[1][np.newaxis], 'sav_gol', savgol_points=10, savgol_order=9) 83 | derived_spectrum = derivative(spectrum[0], smoothed_spectrum) 84 | derived_spectrum_2 = derivative(spectrum[0], smoothed_spectrum, order=2) 85 | 86 | # baseline_ALSS = np.squeeze( 87 | # generate_baseline( 88 | # spectrum[1][np.newaxis], 'ALSS', smoothing=True)) 89 | # baseline_iALSS = np.squeeze( 90 | # generate_baseline( 91 | # spectrum[1][np.newaxis], 'iALSS', smoothing=True)) 92 | # baseline_drPLS = np.squeeze( 93 | # generate_baseline( 94 | # spectrum[1][np.newaxis], 'drPLS', smoothing=True)) 95 | # baseline_SNIP = np.squeeze( 96 | # generate_baseline( 97 | # spectrum[1][np.newaxis], 'SNIP', smoothing=True, transform=False)) 98 | # baseline_ModPoly = np.squeeze( 99 | # generate_baseline( 100 | # spectrum[1][np.newaxis], 'ModPoly', smoothing=True, 101 | # wavenumbers=spectrum[0])) 102 | baseline_IModPoly = np.squeeze( 103 | generate_baseline( 104 | spectrum[1][np.newaxis], 'IModPoly', smoothing=True, 105 | wavenumbers=spectrum[0], poly_order=3)) 106 | # baseline_convex_hull = np.squeeze( 107 | # generate_baseline( 108 | # spectrum[1][np.newaxis], 'convex_hull', smoothing=True, 109 | # wavenumbers=spectrum[0])) 110 | baseline_PPF = np.squeeze( 111 | generate_baseline( 112 | spectrum[1][np.newaxis], 'PPF', smoothing=True, 113 | wavenumbers=spectrum[0], segment_borders=[500], poly_orders=[1, 1], 114 | y_at_borders=[80])) 115 | 116 | plt.figure() 117 | plt.plot(spectrum[0], spectrum[1]) 118 | plt.plot(spectrum[0], smoothed_spectrum.T) 119 | # plt.plot(spectrum[0], baseline_ALSS, label='ALSS') 120 | # plt.plot(spectrum[0], baseline_iALSS, label='iALLS') 121 | # plt.plot(spectrum[0], baseline_drPLS, label='drPLS') 122 
| # plt.plot(spectrum[0], baseline_SNIP, label='SNIP') 123 | # plt.plot(spectrum[0], baseline_ModPoly, label='ModPoly') 124 | plt.plot(spectrum[0], baseline_IModPoly, label='IModPoly') 125 | # plt.plot(spectrum[0], baseline_convex_hull, label='Convex hull') 126 | plt.plot(spectrum[0], baseline_PPF, label='PPF') 127 | # # plt.plot(spectrum[0], np.squeeze(smoothing(spectrum[1][np.newaxis], 'sav_gol', savgol_points=19))) 128 | plt.legend() 129 | 130 | plt.figure() 131 | plt.plot(spectrum[0], np.squeeze(derived_spectrum)) 132 | 133 | plt.figure() 134 | plt.plot(spectrum[0], np.squeeze(derived_spectrum_2)) 135 | 136 | plt.figure() 137 | plt.plot(spectrum[0], spectrum[1]-baseline_IModPoly.T) 138 | plt.plot(spectrum[0], spectrum_clean[1]) 139 | 140 | std_rolling = pd.Series(spectrum[1]).rolling(25, center=True).std() 141 | mean_rolling = pd.Series(spectrum[1]).rolling(25, center=True).mean() 142 | 143 | plt.figure() 144 | plt.plot(spectrum[0], std_rolling*mean_rolling) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | scikit-learn 5 | scipy 6 | tqdm 7 | pyDataFitting 8 | little_helpers 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pyPreprocessing 3 | version = 0.0.2 4 | author = Alexander Southan 5 | author_email = alexander.southan@web.de 6 | description = package preprocessing of datasets, especially from spectroscopy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/AlexanderSouthan/pyPreprocessing 10 | project_urls = 11 | Bug Tracker = https://github.com/AlexanderSouthan/pyPreprocessing/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='pyPreprocessing', 5 | version='0.0.2', 6 | packages=find_packages(where='src'), 7 | install_requires=['numpy', 'pandas', 'scipy', 'matplotlib', 'scikit-learn', 'tqdm'], 8 | dependency_links=['http://github.com/user/repo/tarball/master#egg=package-1.0'] 9 | ) 10 | -------------------------------------------------------------------------------- /src/pyPreprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * 2 | -------------------------------------------------------------------------------- /src/pyPreprocessing/baseline_correction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions correct_baseline and generate_baseline which can be used for 4 | baseline preprocessing of spectral data. See function docstrings for more 5 | detail. 
6 | """
7 | 
8 | import numpy as np
9 | from tqdm import tqdm
10 | from scipy.sparse import diags
11 | from scipy.sparse.linalg import spsolve
12 | from scipy.spatial import ConvexHull
13 | 
14 | from pyDataFitting.polynomial_regression import (polynomial_fit,
15 |                                                  piecewise_polynomial_fit)
16 | from little_helpers.math_functions import piecewise_polynomial
17 | from .transform import transform as transform_spectra
18 | from .smoothing import smoothing as smooth_spectra
19 | from little_helpers.array_tools import y_at_x
20 | 
21 | 
22 | def correct_baseline(raw_data, mode, smoothing=True, transform=False,
23 |                      **kwargs):
24 |     """
25 |     Subtract baseline data from raw_data.
26 | 
27 |     Baseline data is either given explicitly with mode='direct', or is
28 |     calculated with generate_baseline(...). In the former case, it has to have
29 |     the same number of data points per dataset as raw_data, or it has to
30 |     have the same shape as raw_data. In the latter case, the function takes
31 |     the same arguments as generate_baseline; for details see the docstring of
32 |     generate_baseline.
33 |     """
34 |     # baseline_data = np.array([])
35 |     if mode == 'direct':
36 |         baseline_data = kwargs.get('baseline_data')
37 |     else:
38 |         baseline_data = generate_baseline(raw_data, mode, smoothing=smoothing,
39 |                                           transform=transform, **kwargs)
40 |     return raw_data - baseline_data
41 | 
42 | 
43 | def generate_baseline(raw_data, mode, smoothing=True, transform=False,
44 |                       **kwargs):
45 |     """
46 |     Calculate baseline data on input datasets with different algorithms.
47 | 
48 |     Input data:
49 |     -----------
50 |     raw_data: ndarray
51 |         Numpy 2D array of shape (N, M) with N datasets and M data points per
52 |         dataset. If only one dataset is given, it has to have the shape (1, M).
53 |     mode: str
54 |         Algorithm for baseline calculation. Allowed values:
55 |         'convex_hull', 'ALSS', 'iALSS', 'drPLS', 'SNIP', 'ModPoly', 'IModPoly',
56 |         'PPF', 'from_measurement'.
57 |     smoothing: bool
58 |         True if datasets should be smoothed before calculation (recommended),
59 |         otherwise False.
60 |     transform: bool
61 |         True if datasets should be transformed before calculation,
62 |         otherwise False.
63 | 
64 |     kwargs for smoothing == True
65 |     ----------------------------
66 |     savgol_window: int
67 |         window size for Savitzky-Golay window, default=9.
68 |     savgol_order: int
69 |         polynomial order for Savitzky-Golay filter, default=2.
70 | 
71 |     kwargs for transform == True
72 |     ----------------------------
73 |     currently none, but some may be added in future versions.
74 | 
75 |     kwargs for different baseline modes:
76 |     ------------------------------------
77 |     convex_hull:
78 |         wavenumbers: ndarray
79 |             Numpy array containing wavenumbers or wavelengths of datasets.
80 |             Must have M elements and must be sorted. default=np.arange(M)
81 |     ALSS:
82 |         lam: float
83 |             default=10000
84 |         p: float
85 |             default=0.001
86 |         n_iter: int
87 |             default=10
88 |         conv_crit: float
89 |             default=0.001
90 |     iALSS:
91 |         lam: float
92 |             default=2000
93 |         lam_1: float
94 |             default=0.01
95 |         p: float
96 |             default=0.01
97 |         n_iter: int
98 |             default=10
99 |         conv_crit: float
100 |             default=0.001
101 |         wavenumbers: ndarray
102 |             Numpy array containing wavenumbers or wavelengths of datasets.
103 |             Must have M elements. default=np.arange(M)
104 |     drPLS:
105 |         lam: float
106 |             default=1000000
107 |         eta: float
108 |             default=0.5
109 |         n_iter: int
110 |             default=100
111 |         conv_crit: float
112 |             default=0.001
113 |     SNIP:
114 |         n_iter: int
115 |             default=100
116 |     ModPoly, IModPoly:
117 |         wavenumbers: ndarray
118 |             Numpy array containing wavenumbers or wavelengths of datasets.
119 |             Must have M elements and must be sorted. default=np.arange(M)
120 |         n_iter: int
121 |             default=100
122 |         poly_order: int
123 |             default=5
124 |         fixed_points: list of tuples, optional
125 |             Contains constraints for points that the baseline must
126 |             pass through. Each point is given by a tuple of two numbers,
127 |             the wavenumber and the intensity of the point. If no point
128 |             constraints are to be applied, this must be None. The
129 |             default is None.
130 |         fixed_slopes: list of tuples, optional
131 |             Contains constraints for slopes that the fit functions must
132 |             have at specific wavenumbers. Each slope is given by a tuple of
133 |             two numbers, the wavenumber and the slope. If no slope
134 |             constraints are to be applied, this must be None. The
135 |             default is None.
136 |     PPF:
137 |         wavenumbers: ndarray, optional
138 |             Numpy array containing wavenumbers or wavelengths of datasets.
139 |             Must have M elements and must be sorted. default=np.arange(M).
140 |         n_iter: int, optional
141 |             default=100
142 |         segment_borders : list of int or float, optional
143 |             The values with respect to wavenumbers at which the data is divided
144 |             into segments. An arbitrary number of segment borders may be given,
145 |             but it is recommended to provide a sorted list in order to avoid
146 |             confusion. If the list is not sorted, it will be sorted. The
147 |             default is [wavenumbers[len(wavenumbers)//2]], resulting in a
148 |             segmentation in the middle of the data.
149 |         poly_orders : list of int, optional
150 |             A list containing the polynomial orders used for the baseline fit.
151 |             Must contain one more element than segment_borders. Default is
152 |             [3, 3].
153 |         fit_method: str, optional
154 |             Defines if the polynomial baseline fit of the segments is
155 |             performed by the ModPoly ('ModPoly') or IModPoly ('IModPoly')
156 |             algorithm. Default is 'ModPoly'.
157 |         y_at_borders : None, or list of float or None, or 'int_at_borders',
158 |             optional
159 |             May contain dependent variable values used as equality constraints
160 |             at the segment borders. The fits of both touching segments are
161 |             forced through the point given by the pair (segment border,
162 |             y_at_border). The list entries may also be None to state that at a
163 |             certain segment border, no constraint is to be applied. The default
164 |             is 'int_at_borders' which is the intensity value at the
165 |             segment_borders.
166 |     from_measurement:
167 |         measured_baseline : ndarray, optional
168 |             The array contains known baseline data obtained through a
169 |             measurement valid for all datasets in raw_data. The length of the
170 |             array has to match the number M of data points in raw_data. The
171 |             default is a zero filled array, so this is only meaningful if
172 |             some baseline data is passed. Otherwise a zero baseline is
173 |             returned. The returned array has the same shape as raw_data and
174 |             can be subtracted from it directly. This is probably not so
175 |             terribly useful.
176 |     """
177 |     # Optionally, spectrum data is smoothed before baseline calculation. This
178 |     # makes sense especially for baseline generation methods that have problems
179 |     # with noise. Currently Savitzky-Golay only.
180 |     if smoothing:
181 |         savgol_window = kwargs.get('savgol_window', 9)
182 |         savgol_order = kwargs.get('savgol_order', 2)
183 |         raw_data = smooth_spectra(raw_data, 'sav_gol',
184 |                                   savgol_points=savgol_window,
185 |                                   poly_order=savgol_order)
186 | 
187 |     # Transformation makes sense for spectra that cover a broad range of peak
188 |     # intensities. Otherwise, small peaks may be more or less ignored during
189 |     # baseline calculation. Currently LLS transformation only.
190 |     if transform:
191 |         spectra_minimum_value = raw_data.min()
192 |         raw_data = transform_spectra(raw_data, 'log_log_sqrt')
193 | 
194 |     # wavenumbers are used for convex_hull, ModPoly, IModPoly, PPF, iALSS
195 |     if 'wavenumbers' in kwargs:
196 |         wavenumbers = kwargs.get('wavenumbers')
197 |         ascending_wn = (wavenumbers[1]-wavenumbers[0]) > 0
198 |     else:
199 |         wavenumbers = np.arange(raw_data.shape[1])
200 |         ascending_wn = True
201 | 
202 |     baseline_data = np.zeros_like(raw_data)
203 |     baseline_modes = ['convex_hull', 'ALSS', 'iALSS', 'drPLS', 'SNIP',
204 |                       'ModPoly', 'IModPoly', 'PPF', 'from_measurement']
205 | 
206 |     if mode == baseline_modes[0]:  # convex_hull
207 |         # based on (but improved a bit)
208 |         # https://dsp.stackexchange.com/questions/2725/
209 |         # how-to-perform-a-rubberband-correction-on-spectroscopic-data
210 | 
211 |         if ascending_wn:
212 |             raw_data = np.flip(raw_data, axis=1)
213 |             wavenumbers = np.flip(wavenumbers)
214 | 
215 |         for ii, current_spectrum in enumerate(tqdm(raw_data)):
216 |             hull_vertices = ConvexHull(
217 |                 np.array(list(zip(wavenumbers, current_spectrum)))).vertices
218 | 
219 |             # Rotate convex hull vertices until they start from the lowest one
220 |             hull_vertices = np.roll(hull_vertices, -np.argmin(hull_vertices))
221 | 
222 |             # split vertices into upper and lower part
223 |             hull_vertices_section_1 = hull_vertices[:np.argmax(hull_vertices)
224 |                                                     + 1]
225 |             hull_vertices_section_2 = np.sort(
226 |                 np.insert(hull_vertices[np.argmax(hull_vertices):], 0,
227 |                           hull_vertices[0]))
228 | 
229 |             # calculate spectrum mean intensities of upper and lower vertices
230 |             raw_mean_1 = np.mean(current_spectrum[hull_vertices_section_1])
231 |             raw_mean_2 = np.mean(current_spectrum[hull_vertices_section_2])
232 | 
233 |             # Select lower vertices as baseline vertices
234 |             if raw_mean_1 > raw_mean_2:
235 |                 baseline_vertices = hull_vertices_section_2
236 |             else:
237 |                 baseline_vertices = hull_vertices_section_1
238 | 
239 |             # Create baseline using linear interpolation between vertices
240 |             baseline_data[ii, :] = np.interp(
241 |                 wavenumbers, np.flip(wavenumbers[baseline_vertices]),
242 |                 np.flip(current_spectrum[baseline_vertices]))
243 | 
244 |         if ascending_wn:
245 |             baseline_data = np.flip(baseline_data, axis=1)
246 | 
247 |     elif mode == baseline_modes[1]:  # ALSS
248 |         # according to
249 |         # "Baseline Correction with Asymmetric Least Squares Smoothing"
250 |         # by P. Eilers and H. Boelens.
251 | # https://zanran_storage.s3.amazonaws.com/www.science.uva.nl/ 252 | # ContentPages/443199618.pdf 253 | 254 | # set mode specific parameters 255 | lam = kwargs.get('lam', 10000) 256 | p = kwargs.get('p', 0.001) 257 | n_iter = kwargs.get('n_iter', 10) 258 | conv_crit = kwargs.get('conv_crit', 0.001) 259 | ############################# 260 | 261 | L = raw_data.shape[1] 262 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 263 | D = D.dot(D.transpose()) 264 | 265 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 266 | 267 | # this is the code for the fitting procedure 268 | w = np.ones(L) 269 | W = diags(w, format='csr') 270 | z = w 271 | 272 | for jj in range(int(n_iter)): 273 | W.setdiag(w) 274 | Z = W + lam * D 275 | z_prev = z 276 | z = spsolve(Z, w*current_spectrum, permc_spec='NATURAL') 277 | if np.linalg.norm(z - z_prev) > conv_crit: 278 | w = p * (current_spectrum > z) + (1-p) * ( 279 | current_spectrum < z) 280 | else: 281 | break 282 | # end of fitting procedure 283 | 284 | baseline_data[ii, :] = z 285 | 286 | elif mode == baseline_modes[2]: # iALSS 287 | # according to "Anal. Methods, 2014, 6, 4402–4407." 288 | 289 | # set mode specific parameters 290 | lam = kwargs.get('lam', 2000) 291 | lam_1 = kwargs.get('lam_1', 0.01) 292 | p = kwargs.get('p', 0.01) 293 | n_iter = kwargs.get('n_iter', 10) 294 | conv_crit = kwargs.get('conv_crit', 0.001) 295 | ############################# 296 | 297 | L = raw_data.shape[1] 298 | fit_coeffs = np.polynomial.polynomial.polyfit(wavenumbers, 299 | raw_data.T, 2) 300 | w_start_all = np.polynomial.polynomial.polyval(wavenumbers, fit_coeffs) 301 | 302 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 303 | D = D.dot(D.transpose()) 304 | D_1 = diags([-1, 1], [0, -1], shape=(L, L-1), format='csr') 305 | D_1 = D_1.dot(D_1.transpose()) 306 | 307 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 308 | 309 | # this is the code for the fitting procedure 310 | w = w_start_all[ii, :] 311 | z = w 312 | W = diags(w, format='csr') 313 | w = p * (current_spectrum > z) + (1-p) * (current_spectrum < z) 314 | 315 | for jj in range(int(n_iter)): 316 | W.setdiag(w) 317 | W = W.dot(W.transpose()) 318 | Z = W + lam_1 * D_1 + lam * D 319 | R = (W + lam_1 * D_1) * current_spectrum 320 | z_prev = z 321 | z = spsolve(Z, R, permc_spec='NATURAL') 322 | if np.linalg.norm(z - z_prev) > conv_crit: 323 | w = p * (current_spectrum > z) + (1-p) * ( 324 | current_spectrum < z) 325 | else: 326 | break 327 | # end of fitting procedure 328 | 329 | baseline_data[ii, :] = z 330 | 331 | elif mode == baseline_modes[3]: # drPLS 332 | # according to "Applied Optics, 2019, 58, 3913-3920." 
333 | 334 | # set mode specific parameters 335 | lam = kwargs.get('lam', 1000000) 336 | eta = kwargs.get('eta', 0.5) 337 | n_iter = kwargs.get('n_iter', 100) 338 | conv_crit = kwargs.get('conv_crit', 0.001) 339 | ############################# 340 | 341 | L = raw_data.shape[1] 342 | 343 | D = diags([1, -2, 1], [0, -1, -2], shape=(L, L-2), format='csr') 344 | D = D.dot(D.transpose()) 345 | D_1 = diags([-1, 1], [0, -1], shape=(L, L-1), format='csr') 346 | D_1 = D_1.dot(D_1.transpose()) 347 | 348 | w_0 = np.ones(L) 349 | I_n = diags(w_0, format='csr') 350 | 351 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 352 | 353 | # this is the code for the fitting procedure 354 | w = w_0 355 | W = diags(w, format='csr') 356 | Z = w_0 357 | 358 | for jj in range(int(n_iter)): 359 | W.setdiag(w) 360 | Z_prev = Z 361 | Z = spsolve(W + D_1 + lam * (I_n - eta*W) * 362 | D, W*current_spectrum, permc_spec='NATURAL') 363 | if np.linalg.norm(Z - Z_prev) > conv_crit: 364 | d = current_spectrum - Z 365 | d_negative = d[d < 0] 366 | sigma_negative = np.std(d_negative) 367 | mean_negative = np.mean(d_negative) 368 | w = 0.5 * (1 - np.exp(jj) * (d - ( 369 | -mean_negative + 2*sigma_negative))/sigma_negative / ( 370 | 1 + np.abs(np.exp(jj) * (d - ( 371 | - mean_negative + 2 * sigma_negative)) / 372 | sigma_negative))) 373 | else: 374 | break 375 | # end of fitting procedure 376 | 377 | baseline_data[ii, :] = Z 378 | 379 | elif mode == baseline_modes[4]: # SNIP 380 | # according to "Nuclear Instruments and Methods in Physics Research 381 | # 934 (1988) 396-402." 382 | # and Nuclear Instruments and Methods in Physics Research Section A: 383 | # Accelerators, Spectrometers, Detectors and Associated Equipment 1997, 384 | # 401 (1), 113-132 385 | 386 | # set mode specific parameters 387 | n_iter = kwargs.get('n_iter', 100) 388 | ############################# 389 | 390 | spectrum_points = raw_data.shape[1] 391 | working_spectra = np.zeros_like(raw_data) 392 | 393 | for pp in tqdm(np.arange(1, n_iter+1)): 394 | r1 = raw_data[:, pp:spectrum_points-pp] 395 | r2 = (np.roll(raw_data, -pp, axis=1)[:, pp:spectrum_points-pp] + 396 | np.roll(raw_data, pp, axis=1)[:, pp:spectrum_points-pp])/2 397 | working_spectra = np.minimum(r1, r2) 398 | raw_data[:, pp:spectrum_points-pp] = working_spectra 399 | 400 | baseline_data = raw_data 401 | 402 | elif mode in baseline_modes[5:8]: # ModPoly, IModPoly, PPF 403 | # according to Applied Spectroscopy, 2007, 61 (11), 1225-1232. 404 | # without dev: Chemometrics and Intelligent Laboratory Systems 82 405 | # (2006) 59– 65. 406 | # Maybe also ModPoly from first source? 
407 | 408 | # set mode specific parameters 409 | n_iter = kwargs.get('n_iter', 100) 410 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 411 | poly_order = kwargs.get('poly_order', 5) 412 | fixed_points = kwargs.get('fixed_points', None) 413 | fixed_slopes = kwargs.get('fixed_slopes', None) 414 | if mode == baseline_modes[7]: # PPF 415 | segment_borders = kwargs.get( 416 | 'segment_borders', [wavenumbers[len(wavenumbers)//2]]) 417 | 418 | poly_orders = kwargs.get('poly_orders', [3, 3]) 419 | y_at_borders = kwargs.get('y_at_borders', 'int_at_borders') 420 | fit_method = kwargs.get('fit_method', 'ModPoly') 421 | ############################# 422 | 423 | if not ascending_wn: 424 | raw_data = np.flip(raw_data, axis=1) 425 | wavenumbers = np.flip(wavenumbers) 426 | 427 | wavenumbers_start = wavenumbers 428 | # previous_dev = 0 429 | 430 | for ii, current_spectrum in enumerate(tqdm(raw_data)): 431 | wavenumbers = wavenumbers_start 432 | 433 | if mode == baseline_modes[7]: # 'PPF' 434 | if y_at_borders == 'int_at_borders': 435 | y_at_borders_values = y_at_x( 436 | segment_borders, wavenumbers, current_spectrum) 437 | else: 438 | y_at_borders_values = y_at_borders 439 | 440 | for jj in range(int(n_iter)): 441 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 442 | # The polynomial_fit method from pyDataFitting is only used 443 | # if constraints are to be considered because the numpy 444 | # polyfit method is faster. 445 | if (fixed_points is not None) or ( 446 | fixed_slopes is not None): 447 | fit_data, fit_coeffs = polynomial_fit( 448 | wavenumbers, current_spectrum, poly_order, 449 | fixed_points=fixed_points, 450 | fixed_slopes=fixed_slopes) 451 | else: 452 | fit_coeffs = np.polynomial.polynomial.polyfit( 453 | wavenumbers, current_spectrum, poly_order) 454 | fit_data = np.polynomial.polynomial.polyval( 455 | wavenumbers, fit_coeffs) 456 | else: # PPF 457 | fit_data, fit_coeffs = piecewise_polynomial_fit( 458 | wavenumbers, current_spectrum, segment_borders, 459 | poly_orders, y_at_borders=y_at_borders_values, 460 | slope_at_borders=None) 461 | 462 | # ModPoly or PPF with ModPoly 463 | if (mode == baseline_modes[5]) or ( 464 | (mode == baseline_modes[7]) and (fit_method=='ModPoly') 465 | ): 466 | dev = 0 467 | # IModPoly or PPF with IModPoly 468 | else: 469 | residual = current_spectrum - fit_data 470 | dev = residual.std() 471 | # if abs((dev - previous_dev)/dev) < 0.01: 472 | # break 473 | 474 | if jj == 0: 475 | mask = (current_spectrum <= fit_data + dev) 476 | wavenumbers = wavenumbers[mask] 477 | current_spectrum = current_spectrum[mask] 478 | fit_data = fit_data[mask] 479 | np.copyto(current_spectrum, fit_data + dev, 480 | where=(current_spectrum >= (fit_data+dev))) 481 | # previous_dev = dev 482 | 483 | if mode in baseline_modes[5:7]: # ModPoly, IModPoly 484 | baseline_data[ii, :] = np.polynomial.polynomial.polyval( 485 | wavenumbers_start, fit_coeffs) 486 | else: # PPF 487 | baseline_data[ii, :] = piecewise_polynomial( 488 | wavenumbers_start, fit_coeffs, 489 | segment_borders=segment_borders) 490 | 491 | if not ascending_wn: 492 | baseline_data = np.flip(baseline_data, axis=1) 493 | # raw_data = np.flip(raw_data, axis=1) 494 | 495 | elif mode == baseline_modes[8]: # from_measurement 496 | spectrum_number = len(raw_data) 497 | spectrum_points = raw_data.shape[1] 498 | measured_baseline = kwargs.get( 499 | 'measured_baseline', np.zeros(spectrum_points)) 500 | if len(measured_baseline) != spectrum_points: 501 | raise ValueError( 502 | 'The given baseline data consists of {} 
data points, but {} ' 503 | 'were expected due to the shape of raw_data.'.format( 504 | len(measured_baseline), spectrum_points)) 505 | baseline_data = np.tile(measured_baseline, spectrum_number).reshape( 506 | spectrum_number, -1) 507 | 508 | else: 509 | raise ValueError('No valid baseline mode entered. Allowed modes are ' 510 | '{0}'.format(baseline_modes)) 511 | 512 | if transform: 513 | baseline_data = transform_spectra( 514 | baseline_data, 'log_log_sqrt', direction='inverse', 515 | min_value=spectra_minimum_value) 516 | 517 | return np.around(baseline_data, decimals=6) 518 | -------------------------------------------------------------------------------- /src/pyPreprocessing/smoothing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions for smoothing and filtering of data rows oganized in 2D 4 | numpy arrays. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from scipy.signal import savgol_filter 10 | from scipy.interpolate import interp1d 11 | from sklearn.decomposition import PCA 12 | 13 | 14 | def smoothing(raw_data, mode, interpolate=False, point_mirror=True, **kwargs): 15 | """ 16 | Smoothes data rows with different algorithms. 17 | 18 | Parameters 19 | ---------- 20 | raw_data : ndarray 21 | 2D numpy array with the shape (N,M) containing N data rows to be 22 | smoothed. Each data row is represented by row in numpy array and 23 | contains M values. If only one data row is present, raw_data has the 24 | shape (1,M). 25 | mode : str 26 | Algorithm used for smoothing. Allowed modes are 'sav_gol' for Savitzky- 27 | Golay, 'rolling_median' for a median filter, 'pca' for smoothing based 28 | on principal component analysis, 'weighted_moving_average' for a 29 | moving average that uses weights, so e.g. can decide if values in the 30 | window are used for or excluded from averaging. 31 | interpolate : boolean 32 | False if x coordinate is evenly spaced. True if x coordinate is not 33 | evenly spaced, then raw_data is interpolated to an evenly spaced 34 | x coordinate. Default is False 35 | point_mirror : boolean 36 | Dataset is point reflected at both end points before smoothing to 37 | reduce artifacts at the data edges. 38 | **kwargs for interpolate=True 39 | x_coordinate : ndarray 40 | 1D numpy array with shape (M,) used for interpolation. 41 | data_points : int, optional 42 | number of data points returned after interpolation. Default is one 43 | order of magnitude more than M. 44 | return_type : string, optional 45 | Defines if the interpolated dataset with a number of data_points 46 | is returned ('interp') or if the returned dataset has the same 47 | dimensions and x_coordinates like the original dataset ('orig'). 48 | Default is 'interp'. 49 | **kwargs for different smoothing modes 50 | sav_gol: 51 | deriv : int 52 | Derivative order to be calculated. Default is 0 (no 53 | derivative). 54 | savgol_points : int 55 | Number of point defining one side of the Savitzky-Golay window. 56 | Total window is 2*savgol_points+1. Default is 9. 57 | poly_order : int 58 | Polynomial order used for polynomial fitting of the Savitzky- 59 | Golay window. Default is 2. 60 | savgol_mode : str 61 | Must be ‘mirror’, ‘constant’, ‘nearest’, ‘wrap’ or ‘interp’. 62 | See documentation of scipy.signal.savgol_filter. 63 | rolling_median: 64 | window: int 65 | Data points included in rolling window used for median 66 | calculations. Default is 5. 
67 | pca: 68 | pca_components : int 69 | Number of principal components used to reconstruct the original 70 | data. Default is 5. 71 | weighted_moving_average: 72 | weights : list of float 73 | The number of entries decide the window length used for 74 | smoothing. A value > 0 means that the value is used with the 75 | specified weight, a value of 0 means the value is excluded, 76 | e.g. [1, 0, 1] is a window of size 3 in which the center point 77 | is exluded from the calculations. Default is [1, 1, 0, 1, 1]. 78 | 79 | Returns 80 | ------- 81 | ndarray or tuple of ndarrays 82 | 2D numpy array containing the smoothed data in the same shape as 83 | raw_data if interpolate is false. Else tuple containing interpolated 84 | x coordinates and 2D numpy array in the shape of 85 | (N,10**np.ceil(np.log10(len(x_coordinate)))). In case of mode is 86 | weighted_moving_average, the corresponding standard deviations are 87 | also calulated and a tuple with the smoothed data and the standard 88 | deviations is returned. 89 | 90 | """ 91 | # copy of raw_data for later restoration of data edges 92 | raw_old = pd.DataFrame(raw_data.copy()) 93 | # Preprocessing of input data for unevenly spaced x coordinate 94 | if interpolate: 95 | x_coordinate = kwargs.get('x_coordinate', np.linspace( 96 | 0, 1000, raw_data.shape[1])) 97 | data_points = kwargs.get('data_points', 98 | int(10**np.ceil(np.log10(len(x_coordinate))))) 99 | 100 | itp = interp1d(x_coordinate, raw_data, kind='linear') 101 | x_interpolated = np.linspace(x_coordinate[0], x_coordinate[-1], 102 | data_points) 103 | raw_data = itp(x_interpolated) 104 | 105 | # Optional extension of smoothed data by point mirrored raw data. 106 | if point_mirror: 107 | raw_data = np.concatenate( 108 | ((-np.flip(raw_data, axis=1)+2*raw_data[:, 0, np.newaxis])[:, :-1], 109 | raw_data, (-np.flip(raw_data, axis=1) + 110 | 2*raw_data[:, -1, np.newaxis])[:, 1:]), axis=1) 111 | #raw_data = np.concatenate((-np.squeeze(raw_data.T)[::-1]+2*np.squeeze(raw_data.T)[0],np.squeeze(raw_data.T),-np.squeeze(raw_data.T)[::-1]+2*np.squeeze(raw_data.T)[-1]))[np.newaxis] 112 | 113 | smoothing_modes = ['sav_gol', 'rolling_median', 'pca', 114 | 'weighted_moving_average'] 115 | 116 | if mode == smoothing_modes[0]: # sav_gol 117 | deriv = kwargs.get('deriv', 0) 118 | savgol_points = kwargs.get('savgol_points', 9) 119 | poly_order = kwargs.get('poly_order', 2) 120 | savgol_mode = kwargs.get('savgol_mode', 'nearest') 121 | 122 | smoothed_data = savgol_filter(raw_data, 1+2*savgol_points, poly_order, 123 | deriv=deriv, axis=1, mode=savgol_mode) 124 | 125 | elif mode == smoothing_modes[1]: # rolling_median 126 | window = kwargs.get('window', 5) 127 | # next line due to pandas rolling window, look for numpy solution 128 | raw_data = pd.DataFrame(raw_data) 129 | 130 | edge_value_count = int((window-1)/2) 131 | smoothed_data = raw_data.T.rolling( 132 | window, center=True).median().T.iloc[ 133 | :, edge_value_count:-edge_value_count] 134 | 135 | # On the data edges, the original data is used, so the edges are not 136 | # smoothed (only relevant if point_mirror is False). 
137 | smoothed_data = pd.concat( 138 | [raw_old.iloc[:, 0:edge_value_count], smoothed_data, 139 | raw_old.iloc[:, -1-edge_value_count:]], axis=1).values 140 | 141 | elif mode == smoothing_modes[2]: # pca 142 | pca_components = kwargs.get('pca_components', 5) 143 | 144 | pca = PCA(n_components=pca_components) 145 | scores = pca.fit_transform(raw_data) 146 | loadings = pca.components_ 147 | 148 | smoothed_data = ( 149 | np.dot(scores, loadings) + np.mean(raw_data, axis=0)) 150 | 151 | elif mode == smoothing_modes[3]: # weighted_moving_average 152 | weights = kwargs.get('weights', [1, 1, 0, 1, 1]) 153 | 154 | window_size = len(weights) 155 | value_count = raw_data.shape[1] 156 | edge_value_count = int((window_size-1)/2) 157 | remaining_values = value_count-window_size+1 158 | 159 | column_indices = np.repeat( 160 | np.arange(window_size)[np.newaxis], remaining_values, axis=0 161 | ) + np.arange(remaining_values)[:, np.newaxis] 162 | # column_indices = column_indices[:, weights] 163 | 164 | # the following step multiplies the total value number with 165 | # window_size, so might be problematic for large datasets 166 | value_array = np.squeeze(raw_data[np.newaxis][:, :, column_indices]) 167 | if len(value_array.shape) == 2: 168 | value_array = value_array[np.newaxis] 169 | smoothed_data, selective_std = weighted_mean_std(value_array, weights) 170 | smoothed_data = pd.DataFrame(smoothed_data) 171 | 172 | # selective_std = np.std(value_array, axis=2) 173 | # On the edges, the std is calculated from the reduced number of edge 174 | # data points (only relevant if point_mirror is False). 175 | selective_std = np.concatenate(( 176 | np.repeat(np.std(raw_old.values[:, 0:edge_value_count], axis=1), 177 | edge_value_count).reshape(-1, edge_value_count), 178 | selective_std, 179 | np.repeat(np.std(raw_old.values[:, -edge_value_count:], axis=1), 180 | edge_value_count).reshape(-1, edge_value_count) 181 | ), axis=1) 182 | 183 | # On the data edges, the original data is used, so the edges are not 184 | # smoothed (only relevant if point_mirror is False). 185 | raw_data = pd.DataFrame(raw_data) 186 | smoothed_data = pd.concat( 187 | [raw_old.iloc[:, 0:edge_value_count], smoothed_data, 188 | raw_old.iloc[:, -edge_value_count:]], axis=1).values 189 | 190 | else: 191 | raise ValueError('No valid smoothing mode entered. Allowed modes are ' 192 | '{0}'.format(smoothing_modes)) 193 | 194 | # Removal of previously added point mirrored data. 195 | if point_mirror: 196 | smoothed_data = smoothed_data[ 197 | :, int(np.ceil(smoothed_data.shape[1]/3)-1): 198 | int(2*np.ceil(smoothed_data.shape[1]/3)-1)] 199 | if mode == smoothing_modes[3]: # weighted_moving_average 200 | selective_std = selective_std[ 201 | :, int(np.ceil(selective_std.shape[1]/3)-1): 202 | int(2*np.ceil(selective_std.shape[1]/3)-1)] 203 | 204 | if interpolate: 205 | return_type = kwargs.get('return_type', 'interp') 206 | if return_type == 'interp': 207 | return (x_interpolated, smoothed_data) 208 | elif return_type == 'orig': 209 | f = interp1d(x_interpolated, smoothed_data, kind='linear') 210 | return (x_coordinate, f(x_coordinate)) 211 | else: 212 | raise ValueError('No valid return_type given.') 213 | elif mode == smoothing_modes[3]: # weighted_moving_average 214 | return (smoothed_data, selective_std) 215 | else: 216 | return smoothed_data 217 | 218 | 219 | def weighted_mean_std(values, weights, std=True): 220 | """ 221 | Calculate the weighted mean and (biased) standard deviation of values. 
222 | 223 | Parameters 224 | ---------- 225 | values : ndarray 226 | An n-dimensional array in the shape (..., M) with data rows with M 227 | elements. Calculations are performed for each data row in the last 228 | dimension of values. 229 | weights : list of float 230 | A list containing the weights used in the calculations. Must contain 231 | M elements. 232 | std : bool, optional 233 | Decides if the weighted standard deviation is also calculated, default 234 | is True. 235 | 236 | Returns 237 | ------- 238 | weighted_mean : ndarray 239 | An (n-1)-dimensional array containing the weighted means for the data 240 | rows, so has the shape of values without the last dimension. 241 | weighted_std : ndarray 242 | An (n-1)-dimensional array containing the weighted standard deviations 243 | for the data rows, so has the shape of values without the last 244 | dimension. Only in case of std=True. 245 | 246 | """ 247 | weighted_mean = np.average(values, weights=weights, axis=-1) 248 | if std: 249 | weighted_std = np.sqrt( 250 | np.average((values-weighted_mean[..., np.newaxis])**2, 251 | weights=weights, axis=-1)) 252 | return (weighted_mean, weighted_std) 253 | else: 254 | return weighted_mean 255 | 256 | 257 | def filtering(raw_data, mode, fill='NaN', **kwargs): 258 | """ 259 | Filter data rows with different algorithms. 260 | 261 | Filtered values are replaced by np.nan. 262 | 263 | Parameters 264 | ---------- 265 | raw_data : ndarray 266 | 2D numpy array with the shape (N,M) containing N data rows to be 267 | filtered. Each data row is represented by row in numpy array and 268 | contains M values. If only one data row is present, raw_data has the 269 | shape (1, M). 270 | mode : str 271 | Algorithm used for filtering. Allowed modes are 'spike_filter' for 272 | sharp peaks, 'max_thresh' for removal of values above or equal to a 273 | maximum threshold, 'min_thresh' for removal of values below or equal to 274 | a minumum threshold. 275 | fill : str, optional 276 | Decides the way filtered points are replaced. Currently 'NaN' 277 | where values are replaced by np.nan, 'zeros' where values are 278 | replaced by zeros, or 'mov_avg' (only for mode=='spike_filter') where 279 | values are replaced by the weighted moving average. 280 | **kwargs for different filter modes 281 | spike_filter: 282 | weights : list of float, optional 283 | The number of entries decide the window length used for 284 | smoothing. A value > 0 means that the value is used with the 285 | specified weight, a value of 0 means the value is excluded, 286 | e.g. [1, 0, 1] is a window of size 3 in which the center point 287 | is exluded from the calculations. Default is [1, 1, 0, 1, 1]. 288 | std_factor : float, optional 289 | The number of standard deviations a value is allowed to be away 290 | from the moving average before it is removed by the filter. 291 | Mean and standard deviation are calculated in a rolling fashion 292 | so that only sharp peaks are found. Default is 2. 293 | point_mirror : bool, optional 294 | Decides if the data edges are point mirrored before rolling 295 | average. If True, estimates of mean and standard deviation also 296 | at the edges are obtained. If False, data at the edges are kept 297 | like in the original. Default is False. 298 | interpolate : boolean, optional 299 | False if x coordinate is evenly spaced. True if x coordinate is 300 | not evenly spaced, then raw_data is interpolated to an evenly 301 | spaced x coordinate. 
Default is False 302 | max_thresh 303 | max:_thresh : float, optional 304 | The maximum threshold. Default is 1000. 305 | min_thresh 306 | min_thresh : float, optional 307 | The minimum threshold. Default is 0. 308 | 309 | Returns 310 | ------- 311 | ndarray 312 | Returns an ndarray with dimensions like raw_data. Filtered points are 313 | changed according to the fill selected. 314 | 315 | """ 316 | filter_modes = ['spike_filter', 'max_thresh', 'min_thresh'] 317 | fill_values = ['NaN', 'zeros', 'mov_avg'] 318 | if fill == 'NaN': 319 | fill_value = np.nan 320 | elif fill == 'zeros': 321 | fill_value = 0 322 | elif ((fill not in fill_values) or 323 | (fill == 'mov_avg' and mode != filter_modes[0])): 324 | raise ValueError('No valid fill value given for this mode.') 325 | 326 | if mode == filter_modes[0]: # spike_filter 327 | weights = kwargs.get('weights', [1, 1, 0, 1, 1]) 328 | window_size = len(weights) 329 | std_factor = kwargs.get('std_factor', 2) 330 | point_mirror = kwargs.get('point_mirror', False) 331 | interpolate = kwargs.get('interpolate', False) 332 | 333 | filtered_data = raw_data.copy() 334 | mov_avg, mov_std = smoothing( 335 | filtered_data, 'weighted_moving_average', 336 | point_mirror=point_mirror, interpolate=interpolate, 337 | weights=weights) 338 | 339 | diffs = np.absolute(filtered_data - mov_avg) 340 | 341 | if fill == 'mov_avg': 342 | fill_value = mov_avg[diffs > std_factor*mov_std] 343 | filtered_data[diffs > std_factor*mov_std] = fill_value 344 | # filtered_data = raw_data 345 | 346 | elif mode == filter_modes[1]: # max_thresh 347 | maximum_threshold = kwargs.get('max_thresh', 1000) 348 | filtered_data = raw_data.copy().astype(float) 349 | filtered_data[filtered_data > maximum_threshold] = fill_value 350 | # filtered_data = raw_data 351 | 352 | elif mode == filter_modes[2]: # min_thresh 353 | minimum_threshold = kwargs.get('min_thresh', 0) 354 | filtered_data = raw_data.copy().astype(float) 355 | filtered_data[filtered_data < minimum_threshold] = fill_value 356 | # filtered_data = raw_data 357 | 358 | else: 359 | raise ValueError('No valid filter mode entered. Allowed modes are ' 360 | '{0}'.format(filter_modes)) 361 | 362 | return filtered_data 363 | -------------------------------------------------------------------------------- /src/pyPreprocessing/transform.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides functions for data transformation (currently only LLS) and 4 | normalization. 5 | """ 6 | 7 | import numpy as np 8 | 9 | from little_helpers.array_tools import closest_index 10 | 11 | 12 | def transform(raw_data, mode, direction='direct', **kwargs): 13 | """ 14 | Apply mathematical transformations to data. 15 | 16 | Parameters 17 | ---------- 18 | raw_data : ndarray 19 | 2D numpy array with the shape (N, M) containing N data rows to be 20 | smoothed. Each data row is represented by row in numpy array and 21 | contains M values. If only one data row is present, raw_data has the 22 | shape (1, M). 23 | mode : str 24 | Maths used for transformation. Allowed mode is 'log_log_sqrt' only at 25 | the moment which first takes the square root and then does the 26 | logarithm twice. 27 | direction : str, optional 28 | Gives the direction of the tranformation. If 'direct', the data is 29 | transformed, if 'inverse', the inverse of the transformation is 30 | calculated. The default is 'direct'. 
31 | **kwargs for the different modes 32 | mode is 'log_log_sqrt' and direction is 'inverse': 33 | min_value : float 34 | Original minimum value of the data before transformation. Has 35 | to be known because it is lost upon transformation. Default is 36 | 1. 37 | 38 | Raises 39 | ------ 40 | ValueError 41 | If the value passed as mode or direction is not understood. 42 | 43 | Returns 44 | ------- 45 | raw_data : ndarray 46 | Transformed data with the same shape as raw_data. 47 | 48 | """ 49 | # list of allowed modes for data transformation 50 | transform_modes = ['log_log_sqrt'] 51 | 52 | if direction == 'direct': 53 | if mode == transform_modes[0]: 54 | minimum_value = np.min(raw_data) 55 | raw_data -= minimum_value 56 | raw_data = np.log(np.log(np.sqrt(raw_data + 1) + 1) + 1) 57 | else: 58 | raise ValueError('No valid transform mode entered. Allowed modes ' 59 | 'are {0}'.format(transform_modes)) 60 | 61 | elif direction == 'inverse': 62 | if mode == transform_modes[0]: 63 | minimum_value = kwargs.get('min_value', 1) 64 | raw_data = (np.exp(np.exp(raw_data) - 1) - 1)**2 - 1 65 | raw_data += minimum_value 66 | else: 67 | raise ValueError('No valid transform mode entered. Allowed modes ' 68 | 'are {0}'.format(transform_modes)) 69 | else: 70 | raise ValueError('No valid transform direction entered. Allowed ' 71 | 'directions are [\'direct\', \'inverse\']') 72 | 73 | return raw_data 74 | 75 | 76 | def normalize(raw_data, mode, factor=1, **kwargs): 77 | ''' 78 | Normalize data such as spectra to a certain value. 79 | 80 | 81 | Parameters 82 | ---------- 83 | raw_data : ndarray 84 | 2D numpy array with the shape (N, M) containing N data rows to be 85 | normalized. Each data row is represented by row in numpy array and 86 | contains M values. If only one data row is present, raw_data has the 87 | shape (1, M). 88 | mode : string 89 | The mode of data normalization. Allowed modes are 'total_intensity' 90 | (total integral under the data is set to a specific value), 'integral' 91 | (integral under parts of the data is set to a specific value), or 92 | 'max_intensity' (data is divided by maximum intensity). 93 | factor : float, optional 94 | The value the normalized parameter has after the operation. The default 95 | is 1. 96 | **kwargs for the different modes 97 | mode is 'total_intensity' or 'integral': 98 | x_data : ndarray or list 99 | A 1D numpy array or list containing the x data (such as 100 | wavenumbers) corresponding to raw_data. Should be sorted in an 101 | ascending order. 102 | mode is 'integral': 103 | limits : list 104 | A list of two numbers giving the values in x_data which define 105 | the limits of the integration. If this is not given, the mode 106 | 'integral' behaves identical to 'total_intensity'. 107 | 108 | Returns 109 | ------- 110 | normalized_data : ndarray 111 | Normalized data with the same shape as raw_data. 
112 | 113 | ''' 114 | raw_data = np.asarray(raw_data) 115 | 116 | # list of allowed modes for normalization 117 | normalize_modes = ['total_intensity', 'integral', 'max_intensity'] 118 | 119 | if mode in normalize_modes[0:2]: # 'total_intensity', 'integral' 120 | if 'x_data' in kwargs: 121 | x_data = np.asarray(kwargs.get('x_data')) 122 | else: 123 | raise TypeError( 124 | 'For mode \'total_intensity\' or \'integral\', x_data must be ' 125 | 'provided.') 126 | 127 | if 'limits' in kwargs: 128 | limits = kwargs.get('limits') 129 | limit_idx = closest_index(limits, x_data) 130 | else: 131 | limit_idx = [0, len(x_data)-1] 132 | 133 | integral = np.trapezoid( 134 | raw_data[:, limit_idx[0]:limit_idx[1]+1], 135 | x=x_data[limit_idx[0]:limit_idx[1]+1], axis=1)[:, np.newaxis] 136 | 137 | conversion_factor = 1/integral 138 | 139 | elif mode == normalize_modes[2]: # 'max_intensity' 140 | conversion_factor = 1/raw_data.max(axis=1)[:, np.newaxis] 141 | 142 | else: 143 | raise ValueError('No valid normalization mode entered. Allowed modes ' 144 | 'are {0}'.format(normalize_modes)) 145 | 146 | normalized_data = raw_data * conversion_factor * factor 147 | return normalized_data 148 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/test_baseline_correction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import unittest 12 | 13 | from src.pyPreprocessing import baseline_correction 14 | 15 | 16 | class TestBaselineCorrection(unittest.TestCase): 17 | 18 | def test_baseline_correction(self): 19 | 20 | # Calculate a simple spectrum 21 | centers = np.array([1200, 1600]) 22 | amps = np.array([200, 100]) 23 | widths = np.array([30, 500]) 24 | noise_factor = 5 25 | 26 | wavenumbers = np.linspace(1000, 2000, 1001) 27 | intensities = ( 28 | (amps*np.exp(-(wavenumbers[:, None]-centers)**2/widths)).sum(axis=1) + 29 | noise_factor * np.random.normal(size=wavenumbers.size)) 30 | background = 0.0001* wavenumbers**2 31 | spectrum = intensities + background 32 | 33 | # Calculate baselines with different methods 34 | baseline_snip = baseline_correction.generate_baseline( 35 | spectrum[None], 'SNIP') 36 | baseline_convhull = baseline_correction.generate_baseline( 37 | spectrum[None], 'convex_hull', wavenumbers=wavenumbers) 38 | baseline_alss = baseline_correction.generate_baseline( 39 | spectrum[None], 'ALSS') 40 | baseline_ialss = baseline_correction.generate_baseline( 41 | spectrum[None], 'iALSS', wavenumbers=wavenumbers) 42 | baseline_drpls = baseline_correction.generate_baseline( 43 | spectrum[None], 'drPLS') 44 | baseline_modpoly = baseline_correction.generate_baseline( 45 | spectrum[None], 'ModPoly', wavenumbers=wavenumbers) 46 | baseline_imodpoly = baseline_correction.generate_baseline( 47 | spectrum[None], 'IModPoly', wavenumbers=wavenumbers) 48 | baseline_ppf = baseline_correction.generate_baseline( 49 | spectrum[None], 'PPF', wavenumbers=wavenumbers) 50 | 51 | spectrum_corrected = baseline_correction.correct_baseline( 52 | spectrum[None], 'SNIP') 53 | 54 | self.assertTrue(np.all(spectrum_corrected == spectrum-baseline_snip)) 55 | 56 | # test 
with transformation 57 | baseline_transform = baseline_correction.generate_baseline( 58 | spectrum[None], 'SNIP', transform=True) 59 | 60 | # test with flipped spectrum 61 | baseline_snip_desc = baseline_correction.generate_baseline( 62 | spectrum[::-1][None], 'SNIP') 63 | self.assertTrue(np.all(baseline_snip_desc[0, ::-1] == baseline_snip)) 64 | 65 | # test with descending wavenumbers 66 | baseline_modpoly_desc = baseline_correction.generate_baseline( 67 | spectrum[::-1][None], 'ModPoly', wavenumbers=wavenumbers[::-1]) 68 | self.assertTrue( 69 | np.all(baseline_modpoly_desc[0, ::-1] == baseline_modpoly)) 70 | 71 | # PPF with fixed border values 72 | baseline_ppf_fixed = baseline_correction.generate_baseline( 73 | spectrum[None], 'PPF', wavenumbers=wavenumbers, y_at_borders=[250], 74 | segment_borders=[1400.0]) 75 | wn_idx = np.argmax(wavenumbers==1400) 76 | self.assertEqual(baseline_ppf_fixed[0, wn_idx], 250) 77 | 78 | # ModPoly with fixed points 79 | baseline_modpoly_fixed = baseline_correction.generate_baseline( 80 | spectrum[None], 'ModPoly', wavenumbers=wavenumbers, 81 | fixed_points=[[1400, 250]]) 82 | self.assertEqual(baseline_modpoly_fixed[0, wn_idx], 250) 83 | 84 | # test error messages 85 | self.assertRaises(ValueError, baseline_correction.generate_baseline, 86 | spectrum[None], 'ModPoy') 87 | 88 | plt.plot(wavenumbers, spectrum) 89 | plt.plot(wavenumbers, baseline_snip.T, label='SNIP') 90 | plt.plot(wavenumbers, baseline_convhull.T, label='convex hull') 91 | plt.plot(wavenumbers, baseline_alss.T, label='ALSS') 92 | plt.plot(wavenumbers, baseline_ialss.T, label='iALSS') 93 | plt.plot(wavenumbers, baseline_drpls.T, label='drPLS') 94 | plt.plot(wavenumbers, baseline_modpoly.T, label='ModPoly') 95 | plt.plot(wavenumbers, baseline_imodpoly.T, label='IModPoly') 96 | plt.plot(wavenumbers, baseline_ppf.T, label='PPF') 97 | plt.legend() 98 | -------------------------------------------------------------------------------- /tests/test_smoothing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import unittest 12 | 13 | from src.pyPreprocessing import smoothing 14 | 15 | 16 | class TestSmoothing(unittest.TestCase): 17 | 18 | def test_smoothing(self): 19 | x = np.linspace(0, 10, 1100) 20 | noise = np.random.normal(size=(50, len(x))) 21 | 22 | x_interp, noise_savgol = smoothing.smoothing( 23 | noise, 'sav_gol', interpolate=True, 24 | x_coordinate=x, return_type='interp', savgol_points=10, 25 | window=15, data_points=1200, point_mirror=True) 26 | self.assertEqual(len(x_interp), len(noise_savgol.T)) 27 | self.assertEqual(len(x_interp), 1200) 28 | self.assertTrue(noise.std() > noise_savgol.std()) 29 | 30 | x_interp_2, noise_savgol_2 = smoothing.smoothing( 31 | noise, 'sav_gol', interpolate=True, 32 | x_coordinate=x, return_type='orig', savgol_points=10, 33 | window=15, data_points=1200, point_mirror=True) 34 | self.assertEqual(len(x_interp_2), len(noise_savgol_2.T)) 35 | self.assertEqual(len(x_interp_2), 1100) 36 | self.assertTrue(noise.std() > noise_savgol_2.std()) 37 | 38 | noise_rollingmedian = smoothing.smoothing( 39 | noise, 'rolling_median', window=10) 40 | self.assertTrue(noise.std() > noise_rollingmedian.std()) 41 | 42 | noise_pca = smoothing.smoothing(noise, 'pca', pca_components=2) 43 | self.assertTrue(noise.std() > noise_pca.std()) 44 | 45 
| noise_weightedaverage, _ = smoothing.smoothing( 46 | noise, 'weighted_moving_average') 47 | self.assertTrue(noise.std() > noise_weightedaverage.std()) 48 | 49 | # test with only one dataset 50 | noise_rollingmedian_single = smoothing.smoothing( 51 | noise[[0]], 'rolling_median', window=10) 52 | 53 | # test errors 54 | self.assertRaises(ValueError, smoothing.smoothing, noise[[0]], 55 | 'roling_median') 56 | self.assertRaises(ValueError, smoothing.smoothing, noise, 'sav_gol', 57 | interpolate=True, x_coordinate=x, return_type='irp', 58 | savgol_points=10, window=15, data_points=1100, 59 | point_mirror=True) 60 | 61 | def test_filtering(self): 62 | x = np.linspace(0, 10, 1100) 63 | noise = np.random.normal(size=(50, len(x))) 64 | noise[:, 500] = 300 65 | noise[:, 700] = -300 66 | 67 | # test spike filter 68 | noise_spike = smoothing.filtering(noise, 'spike_filter') 69 | self.assertTrue(np.all(np.isnan(noise_spike[:, 500]))) 70 | self.assertTrue(np.all(np.isnan(noise_spike[:, 700]))) 71 | 72 | noise_spike_2 = smoothing.filtering( 73 | noise, 'spike_filter', fill='mov_avg', 74 | weights=[1, 0.2, 1, 0, 0.5, 1, 1]) 75 | check_avg = (noise[:, 497] + 0.2 * noise[:, 498] + noise[:, 499] + 76 | 0.5 * noise[:, 501] + noise[:, 502] + noise[:, 503])/4.7 77 | self.assertTrue(np.all(noise_spike_2[:, 500]==check_avg)) 78 | 79 | noise_spike_3 = smoothing.filtering( 80 | noise, 'spike_filter', fill='zeros') 81 | self.assertTrue(np.all(noise_spike_3[:, 500] == 0)) 82 | self.assertTrue(np.all(noise_spike_3[:, 700] == 0)) 83 | 84 | # test maximum threshold 85 | noise_maxthresh = smoothing.filtering(noise, 'max_thresh', 86 | max_thresh=299) 87 | self.assertTrue(np.all(np.isnan(noise_maxthresh[:, 500]))) 88 | self.assertFalse(np.any(np.isnan(noise_maxthresh[:, 700]))) 89 | 90 | noise_maxthresh_2 = smoothing.filtering(noise, 'max_thresh', 91 | max_thresh=301) 92 | self.assertFalse(np.any(np.isnan(noise_maxthresh_2[:, 500]))) 93 | self.assertFalse(np.any(np.isnan(noise_maxthresh_2[:, 700]))) 94 | 95 | # test minimum threshold 96 | noise_minthresh = smoothing.filtering(noise, 'min_thresh', 97 | min_thresh=-299) 98 | self.assertFalse(np.any(np.isnan(noise_minthresh[:, 500]))) 99 | self.assertTrue(np.all(np.isnan(noise_minthresh[:, 700]))) 100 | 101 | noise_minthresh_2 = smoothing.filtering(noise, 'min_thresh', 102 | min_thresh=-301) 103 | self.assertFalse(np.any(np.isnan(noise_minthresh_2[:, 500]))) 104 | self.assertFalse(np.any(np.isnan(noise_minthresh_2[:, 700]))) 105 | 106 | # test errors 107 | self.assertRaises(ValueError, smoothing.filtering, noise, 108 | 'max_thresh', fill='mov_avg') 109 | self.assertRaises(ValueError, smoothing.filtering, noise, 110 | 'max_thresh', fill='zero') 111 | self.assertRaises(ValueError, smoothing.filtering, noise, 112 | 'spike_fil') -------------------------------------------------------------------------------- /tests/test_transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Oct 16 19:59:07 2021 5 | 6 | @author: Alexander Southan 7 | """ 8 | 9 | import numpy as np 10 | import unittest 11 | 12 | from src.pyPreprocessing import transform 13 | 14 | 15 | class TestTransform(unittest.TestCase): 16 | 17 | def test_transform(self): 18 | x = np.linspace(0, 10, 1100) 19 | y = x**2 -30 20 | 21 | # test lls transformation 22 | y_lls = transform.transform([y], 'log_log_sqrt', direction='direct') 23 | y_lls_inv = transform.transform( 24 | y_lls, 'log_log_sqrt', 
direction='inverse', min_value=y.min()) 25 | self.assertTrue(np.allclose(y, y_lls_inv[0])) 26 | 27 | # test errors 28 | self.assertRaises( 29 | ValueError, transform.transform, [y], 'log_log_sq', 30 | direction='direct') 31 | self.assertRaises( 32 | ValueError, transform.transform, [y], 'log_log_sq', 33 | direction='inverse') 34 | self.assertRaises( 35 | ValueError, transform.transform, [y], 'log_log_sqrt', 36 | direction='dir') 37 | 38 | def test_normalize(self): 39 | x = np.linspace(0, 10, 1100) 40 | y = x**2 -30 41 | 42 | y_norm = transform.normalize([y], 'total_intensity', x_data=x) 43 | self.assertAlmostEqual(np.trapezoid(y_norm, x=x, axis=1)[0], 1) 44 | 45 | y_norm_2 = transform.normalize([y], 'total_intensity', x_data=x, 46 | factor=3.25) 47 | self.assertAlmostEqual(np.trapezoid(y_norm_2, x=x, axis=1)[0], 3.25) 48 | 49 | # test errors 50 | self.assertRaises(ValueError, transform.normalize, [y], 'tot_int') 51 | --------------------------------------------------------------------------------
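
The following is an illustrative usage sketch, not a file of this repository. It shows how the baseline correction and smoothing functions documented above fit together, and it assumes pyPreprocessing is installed together with the dependencies listed in requirements.txt (including pyDataFitting and little_helpers). The simulated spectrum and all parameter values are made up for demonstration and mirror the calls used in examples/baseline_correction_examples.py and tests/test_baseline_correction.py.

import numpy as np

from pyPreprocessing.baseline_correction import (correct_baseline,
                                                 generate_baseline)
from pyPreprocessing.smoothing import smoothing

# Simulated noisy spectrum with a quadratic background. The functions expect
# 2D input, so a single spectrum is passed with shape (1, M).
wavenumbers = np.linspace(1000, 2000, 1001)
intensities = (200 * np.exp(-(wavenumbers - 1200)**2 / 30) +
               5 * np.random.normal(size=wavenumbers.size) +
               0.0001 * wavenumbers**2)
spectrum = intensities[np.newaxis]

# Estimate the background with the IModPoly algorithm (for inspection) and
# subtract it from the raw data in one step with correct_baseline.
baseline = generate_baseline(spectrum, 'IModPoly', wavenumbers=wavenumbers,
                             poly_order=3)
corrected = correct_baseline(spectrum, 'IModPoly', wavenumbers=wavenumbers,
                             poly_order=3)

# Savitzky-Golay smoothing of the baseline-corrected spectrum.
smoothed = smoothing(corrected, 'sav_gol', savgol_points=10, poly_order=2)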
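
A second sketch, likewise illustrative and based on the calls in tests/test_smoothing.py and tests/test_transform.py, shows spike removal with the filtering function followed by normalization; the input data and the spike position are assumptions chosen only for demonstration.

import numpy as np

from pyPreprocessing import transform
from pyPreprocessing.smoothing import filtering

# One data row with an artificial spike at index 500, shape (1, M).
x = np.linspace(0, 10, 1100)
data = (x**2)[np.newaxis].astype(float)
data[0, 500] = 300

# Remove sharp spikes; filtered points are replaced by the weighted moving
# average of their neighborhood.
despiked = filtering(data, 'spike_filter', fill='mov_avg')

# Normalize so that the integral under each data row equals 1.
normalized = transform.normalize(despiked, 'total_intensity', x_data=x)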