├── bridgescaler
│   ├── VERSION
│   ├── __init__.py
│   ├── tests
│   │   ├── backend_test.py
│   │   ├── deep_test.py
│   │   ├── group_test.py
│   │   ├── distributed_tensor_test.py
│   │   └── distributed_test.py
│   ├── deep.py
│   ├── backend.py
│   ├── group.py
│   ├── distributed_tensor.py
│   └── distributed.py
├── setup.py
├── MANIFEST.in
├── requirements.txt
├── doc
│   ├── source
│   │   ├── _static
│   │   │   ├── logo.graffle
│   │   │   └── bridgescaler_logo.png
│   │   ├── modules.rst
│   │   ├── gettingstarted.rst
│   │   ├── index.rst
│   │   ├── bridgescaler.rst
│   │   ├── group.rst
│   │   ├── conf.py
│   │   ├── usage.rst
│   │   └── distributed.rst
│   ├── Makefile
│   └── make.bat
├── environment.yml
├── scripts
│   ├── numpy_mem.py
│   ├── scipy_ppf_example.py
│   └── eval_scaler.py
├── .readthedocs.yaml
├── setup.cfg
├── LICENSE
├── .github
│   └── workflows
│       ├── python-publish.yml
│       └── python-package-conda.yml
├── .gitignore
├── README.md
└── notebooks
    └── Bridgscaler_intro.ipynb
/bridgescaler/VERSION:
--------------------------------------------------------------------------------
1 | 0.8.0
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | if __name__ == "__main__":
4 |     setup()
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include bridgescaler/*.py
3 | include bridgescaler/VERSION
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy<2.0
3 | scikit-learn>=1.0
4 | crick
5 | xarray
6 | scipy>=1.11.0
7 | numba
8 |
--------------------------------------------------------------------------------
/doc/source/_static/logo.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/bridgescaler/main/doc/source/_static/logo.graffle
--------------------------------------------------------------------------------
/doc/source/modules.rst:
--------------------------------------------------------------------------------
1 | bridgescaler
2 | ============
3 |
4 | ..
toctree:: 5 | :maxdepth: 4 6 | 7 | bridgescaler 8 | -------------------------------------------------------------------------------- /doc/source/_static/bridgescaler_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/bridgescaler/main/doc/source/_static/bridgescaler_logo.png -------------------------------------------------------------------------------- /bridgescaler/__init__.py: -------------------------------------------------------------------------------- 1 | from .backend import save_scaler, load_scaler, print_scaler, read_scaler 2 | from .group import GroupStandardScaler, GroupRobustScaler, GroupMinMaxScaler 3 | from .deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 4 | from .distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bridgescaler 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - numpy<2 8 | - scipy>=1.11.0 9 | - pandas 10 | - scikit-learn 11 | - pyarrow 12 | - pytest 13 | - sphinx 14 | - xarray 15 | - crick 16 | - numba 17 | - sphinx-book-theme 18 | - pip: 19 | - . 20 | -------------------------------------------------------------------------------- /scripts/numpy_mem.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import psutil 6 | import xarray as xr 7 | mem = [] 8 | def get_data(): 9 | return np.zeros((1000, 50, 50), dtype=np.float32) 10 | data = get_data() 11 | for i in range(data.shape[0]): 12 | data[i] = np.random.random((50, 50)) 13 | mem.append(psutil.virtual_memory()[1]) 14 | mem.append(psutil.virtual_memory()[1]) 15 | xd = xr.DataArray(data) 16 | mem.append(psutil.virtual_memory()[1]) 17 | plt.plot(mem) 18 | plt.savefig("mem_profile.png", dpi=200, bbox_inches="tight") 19 | -------------------------------------------------------------------------------- /doc/source/gettingstarted.rst: -------------------------------------------------------------------------------- 1 | .. title:: Getting Started 2 | 3 | .. gettingstarted: 4 | 5 | Getting Started 6 | =============== 7 | 8 | Dependencies 9 | ------------ 10 | * scikit-learn 11 | * numpy 12 | * pandas 13 | * xarray 14 | * pydigest 15 | 16 | Installation 17 | ------------ 18 | For a stable version of bridgescaler, you can install from PyPI. 19 | 20 | .. code-block:: bash 21 | 22 | pip install bridgescaler 23 | 24 | For the latest version of bridgescaler, install from github. 25 | 26 | .. code-block:: bash 27 | 28 | git clone https://github.com/NCAR/bridgescaler.git 29 | cd bridgescaler 30 | pip install . 31 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. bridgescaler documentation master file, created by 2 | sphinx-quickstart on Wed Feb 7 10:59:45 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Bridgescaler 7 | ======================================== 8 | Bridgescaler is a library to support reproducible and 9 | distributed scaling of data for pre-processing of AI and ML models. 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: Contents: 14 | 15 | gettingstarted.rst 16 | usage.rst 17 | distributed.rst 18 | group.rst 19 | modules.rst 20 | 21 | 22 | 23 | Indices and tables 24 | ================== 25 | 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/bridgescaler.rst: -------------------------------------------------------------------------------- 1 | bridgescaler package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | bridgescaler.backend module 8 | --------------------------- 9 | 10 | .. automodule:: bridgescaler.backend 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | bridgescaler.deep module 16 | ------------------------ 17 | 18 | .. automodule:: bridgescaler.deep 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | bridgescaler.distributed module 24 | ------------------------------- 25 | 26 | .. automodule:: bridgescaler.distributed 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | bridgescaler.group module 32 | ------------------------- 33 | 34 | .. automodule:: bridgescaler.group 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: bridgescaler 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "mambaforge-22.9" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | conda: 19 | environment: environment.yml 20 | # Build documentation in the "docs/" directory with Sphinx 21 | sphinx: 22 | configuration: doc/source/conf.py 23 | 24 | # Optionally build your docs in additional formats such as PDF and ePub 25 | # formats: 26 | # - pdf 27 | # - epub 28 | 29 | # Optional but recommended, declare the Python requirements required 30 | # to build your documentation 31 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 32 | # python: 33 | # install: 34 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = bridgescaler 3 | description = Tool to automagically save scikit-learn scaler properties to a portable, readable format. 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | author = David John Gagne 7 | author_email = dgagne@ucar.edu 8 | version = file: bridgescaler/VERSION 9 | license = MIT 10 | license_file = LICENSE 11 | platform = any 12 | keywords = machine learning 13 | classifiers = 14 | Programming Language :: Python 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | url = https://github.com/NCAR/bridgescaler 20 | 21 | [options] 22 | zip_safe = True 23 | packages = find: 24 | include_package_data = True 25 | setup_requires = setuptools 26 | python_requires = >=3.7 27 | install_requires = 28 | scikit-learn>=1.0 29 | numpy 30 | pandas 31 | crick 32 | scipy 33 | xarray 34 | numba 35 | sphinx 36 | sphinx-book-theme 37 | -------------------------------------------------------------------------------- /doc/source/group.rst: -------------------------------------------------------------------------------- 1 | .. title:: Group Scalers 2 | 3 | .. group: 4 | 5 | Group Scalers 6 | ============= 7 | 8 | The group scalers use the same scaling parameters for a group of similar 9 | variables rather than scaling each column independently. This is useful for situations where variables are related, 10 | such as temperatures at different height levels. 11 | 12 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 13 | for numpy arrays. 14 | 15 | For example: 16 | 17 | .. 
code-block:: python 18 | 19 | from bridgescaler.group import GroupStandardScaler 20 | import pandas as pd 21 | import numpy as np 22 | x_rand = np.random.random(size=(100, 5)) 23 | data = pd.DataFrame(data=x_rand, 24 | columns=["a", "b", "c", "d", "e"]) 25 | groups = [["a", "b"], ["c", "d"], "e"] 26 | group_scaler = GroupStandardScaler() 27 | x_transformed = group_scaler.fit_transform(data, groups=groups) 28 | 29 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 30 | deviation for that group. -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'bridgescaler' 10 | copyright = '2024, University Corporation for Atmopsheric Research' 11 | author = 'David John Gagne' 12 | release = '0.8.0' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ['sphinx.ext.napoleon'] 18 | 19 | templates_path = ['_templates'] 20 | exclude_patterns = [] 21 | 22 | 23 | 24 | # -- Options for HTML output ------------------------------------------------- 25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 26 | 27 | html_theme = 'sphinx_book_theme' 28 | html_static_path = ['_static'] 29 | html_logo = "_static/bridgescaler_logo.png" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 University Corporation for Atmospheric Research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /scripts/scipy_ppf_example.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import norm 2 | from scipy.special import ndtri 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import psutil 6 | import gc 7 | 8 | process = psutil.Process() 9 | n_elements = 301 10 | mem_vals = np.zeros(n_elements) 11 | mem_vals[0] = process.memory_info().rss / 1e6 12 | for i in range(1, n_elements): 13 | x = np.random.random(size=(100, 50, 50)) 14 | ppf_val = ndtri(x) 15 | mem_vals[i] = process.memory_info().rss / 1e6 16 | gc.collect() 17 | plt.plot(mem_vals[1:] - mem_vals[0], label="ndtri") 18 | mem_vals = np.zeros(n_elements) 19 | mem_vals[0] = process.memory_info().rss / 1e6 20 | 21 | for i in range(1, n_elements): 22 | x = np.random.random(size=(100, 50, 50)) 23 | ppf_val = norm.ppf(x) 24 | mem_vals[i] = process.memory_info().rss / 1e6 25 | gc.collect() 26 | plt.plot(mem_vals[1:] - mem_vals[0], label="norm.ppf") 27 | mem_vals = np.zeros(n_elements) 28 | mem_vals[0] = process.memory_info().rss / 1e6 29 | for i in range(1, n_elements): 30 | x = np.random.random(size=(100, 50, 50)) 31 | mem_vals[i] = process.memory_info().rss / 1e6 32 | gc.collect() 33 | plt.plot(mem_vals[1:] - mem_vals[0], label="control") 34 | plt.xlabel("Iterations") 35 | plt.ylabel("Memory usage (MB)") 36 | plt.legend() 37 | plt.savefig("norm_usage_tracking.png", dpi=200, bbox_inches="tight") 38 | -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | workflow_dispatch: 9 | 10 | permissions: 11 | actions: write 12 | checks: write 13 | contents: read 14 | pull-requests: write 15 | statuses: write 16 | 17 | jobs: 18 | build-linux: 19 | runs-on: ubuntu-latest 20 | strategy: 21 | max-parallel: 5 22 | defaults: 23 | run: 24 | shell: bash -l {0} 25 | steps: 26 | - uses: actions/checkout@v6 27 | - name: Setup Python 
28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: '3.11' 31 | cache: 'pip' 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade uv 35 | uv pip install torch --system --index-url https://download.pytorch.org/whl/cpu 36 | uv pip install . --system 37 | uv pip install ruff pytest --system 38 | - name: Lint with ruff 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | ruff check --select=E9,F63,F7,F82 --exit-zero 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | ruff check --output-format concise --exit-zero 44 | # Checking documentation errors 45 | ruff check --select=D --exit-zero --statistics 46 | - name: Test with pytest 47 | run: | 48 | pytest 49 | -------------------------------------------------------------------------------- /doc/source/usage.rst: -------------------------------------------------------------------------------- 1 | .. title:: Basic Usage 2 | 3 | .. usage: 4 | 5 | Basic Usage 6 | =========== 7 | bridgescaler supports all the common scikit-learn scaler classes: 8 | 9 | * StandardScaler 10 | * RobustScaler 11 | * MinMaxScaler 12 | * MaxAbsScaler 13 | * QuantileTransformer 14 | * PowerTransformer 15 | * SplineTransformer 16 | 17 | First, create some synthetic data to transform. 18 | 19 | .. code-block:: python 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | # specify distribution parameters for each variable 25 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 26 | scales = np.array([1.0, 10, 0.1, 5000.0]) 27 | names = ["A", "B", "C", "D"] 28 | num_examples = 205 29 | x_data_dict = {} 30 | for l in range(locs.shape[0]): 31 | # sample from random normal with different parameters 32 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 33 | x_data = pd.DataFrame(x_data_dict) 34 | 35 | Now, let's fit and transform the data with StandardScaler. 36 | 37 | .. code-block:: python 38 | 39 | from sklearn.preprocessing import StandardScaler 40 | from bridgescaler import save_scaler, load_scaler 41 | scaler = StandardScaler() 42 | scaler.fit_transform(x_data) 43 | filename = "x_standard_scaler.json" 44 | # save to json file 45 | save_scaler(scaler, filename) 46 | # create new StandardScaler from json file information. 47 | new_scaler = load_scaler(filename) 48 | # new_scaler is a StandardScaler object -------------------------------------------------------------------------------- /doc/source/distributed.rst: -------------------------------------------------------------------------------- 1 | .. title:: Distributed Scalers 2 | 3 | .. distributed: 4 | 5 | Distributed Scalers 6 | =================== 7 | The distributed scalers allow you to calculate scaling 8 | parameters on different subsets of a dataset and then combine the scaling factors 9 | together to get representative scaling values for the full dataset. Distributed 10 | Standard Scalers, MinMax Scalers, and Quantile Transformers have been implemented and work with both tabular 11 | and muliti-dimensional patch data in numpy, pandas DataFrame, and xarray DataArray formats. 12 | 13 | By default, the scaler assumes your channel/variable dimension is the last 14 | dimension, but if `channels_last=False` is set in the `__init__`, `transform`, 15 | or `inverse_transform` methods, then the 2nd dimension is assumed to be the variable 16 | dimension. It is possible to fit data with one ordering and then 17 | transform it with a different one. 
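For instance, the minimal sketch below (the array shapes are only illustrative) fits a scaler on
channels-last data and then transforms a channels-first view of the same array:

.. code-block:: python

    from bridgescaler.distributed import DStandardScaler
    import numpy as np

    x = np.random.normal(0, 1, (16, 8, 8, 3))  # variable dimension last
    dss = DStandardScaler(channels_last=True)
    dss.fit(x)
    x_first = np.transpose(x, (0, 3, 1, 2))  # variable dimension second
    x_scaled = dss.transform(x_first, channels_last=False)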
18 | 19 | For large datasets, it may be expensive to redo the scalers if you want to use a 20 | subset or different ordering of variables. However, in bridgescaler, the 21 | Distributed Scalers all support arbitrary ordering and subsets of variables for transforms if 22 | the input data are in a Xarray DataArray or Pandas DataFrame with variable 23 | names that match the original data. 24 | 25 | Example: 26 | 27 | .. code-block:: python 28 | 29 | from bridgescaler.distributed import DStandardScaler 30 | import numpy as np 31 | 32 | x_1 = np.random.normal(0, 2.2, (20, 5, 4, 8)) 33 | x_2 = np.random.normal(1, 3.5, (25, 4, 8, 5)) 34 | 35 | dss_1 = DStandardScaler(channels_last=False) 36 | dss_2 = DStandardScaler(channels_last=True) 37 | dss_1.fit(x_1) 38 | dss_2.fit(x_2) 39 | dss_combined = np.sum([dss_1, dss_2]) 40 | 41 | dss_combined.transform(x_1, channels_last=False) 42 | 43 | Distributed scalers can be stored in individual json files or within 44 | a pandas DataFrame for easy loading and combining later. 45 | 46 | .. code-block:: python 47 | 48 | import pandas as pd 49 | from bridgescaler import print_scaler, read_scaler 50 | scaler_list = [dss_1, dss_2] 51 | df = pd.DataFrame({"scalers": [print_scaler(s) for s in scaler_list]}) 52 | df.to_parquet("scalers.parquet") 53 | df_new = pd.read_parquet("scalers.parquet") 54 | scaler_objs = df_new["scalers"].apply(read_scaler) 55 | total_scaler = scaler_objs.sum() 56 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /bridgescaler/tests/backend_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 2 | from bridgescaler.backend import create_synthetic_data 3 | import numpy as np 4 | import os 5 | from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer 6 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 7 | from pandas import DataFrame 8 | from os.path import exists 9 | 10 | 11 | scaler_objs = {"StandardScaler": StandardScaler, 12 | "MinMaxScaler": MinMaxScaler, 13 | "RobustScaler": RobustScaler, 14 | "QuantileTransformer": QuantileTransformer, 15 | "DStandardScaler": DStandardScaler, 16 | "DMinMaxScaler": DMinMaxScaler, 17 | "DQuantileScaler": DQuantileScaler} 18 | 19 | 20 | def test_scaler_io(): 21 | try: 22 | x_data = create_synthetic_data() 23 | for scaler_name, scaler_obj in scaler_objs.items(): 24 | scaler = scaler_obj() 25 | x_scaled_data = scaler.fit_transform(x_data) 26 | save_scaler(scaler, "test.json") 27 | assert exists("test.json") 28 | loaded_scaler = load_scaler("test.json") 29 | assert type(loaded_scaler) is type(scaler), "Type mismatch" 30 | loaded_scaled_data = loaded_scaler.transform(x_data) 31 | if type(x_scaled_data) is DataFrame: 32 | transform_diff = np.max(np.abs(x_scaled_data.values - loaded_scaled_data.values)) 33 | else: 34 | transform_diff = np.max(np.abs(x_scaled_data - loaded_scaled_data)) 35 | 36 | assert transform_diff < np.finfo(np.float32).eps, scaler_name + " transform does not match" 37 | finally: 38 | if exists("test.json"): 39 | os.remove("test.json") 40 | return 41 | 42 | 43 | def test_scaler_str(): 44 | x_data = create_synthetic_data() 45 | for scaler_name, scaler_obj in scaler_objs.items(): 46 | scaler = scaler_obj() 47 | x_scaled_data = scaler.fit_transform(x_data) 48 | scaler_str = print_scaler(scaler) 49 | loaded_scaler = read_scaler(scaler_str) 50 | assert type(loaded_scaler) is type(scaler), "Type Mismatch" 51 | loaded_scaled_data = loaded_scaler.transform(x_data) 52 | if type(x_scaled_data) is DataFrame: 53 | transform_diff = np.max(np.abs(x_scaled_data.values - loaded_scaled_data.values)) 54 | else: 55 | transform_diff = np.max(np.abs(x_scaled_data - loaded_scaled_data)) 56 | assert transform_diff < np.finfo(np.float32).eps, scaler_name + " transform does not match" 57 | -------------------------------------------------------------------------------- /bridgescaler/tests/deep_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 2 | from sklearn.preprocessing import QuantileTransformer 3 | from bridgescaler import save_scaler, load_scaler 4 | import numpy as np 5 | from 
os.path import exists 6 | import os 7 | 8 | 9 | def test_deep_standard_scaler(): 10 | save_filename = "deep_standard.json" 11 | try: 12 | np.random.seed(352680) 13 | 14 | n_ex = 5000 15 | n_channels = 4 16 | dim = 32 17 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 18 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 19 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 20 | for chan in range(n_channels): 21 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 22 | dss = DeepStandardScaler() 23 | dss.fit(x) 24 | x_transformed = dss.transform(x) 25 | x_telephone = dss.inverse_transform(x_transformed) 26 | assert x_transformed.shape == x.shape, "Shape mismatch" 27 | assert np.mean(np.abs(x_telephone - x)) < 10 * np.finfo(np.float32).eps, "Significant differences" 28 | save_scaler(dss, save_filename) 29 | reloaded_scaler = load_scaler(save_filename) 30 | x_t_r = reloaded_scaler.transform(x) 31 | assert np.all(x_transformed == x_t_r), "Scaler reloads properly" 32 | finally: 33 | if exists(save_filename): 34 | os.remove(save_filename) 35 | return 36 | 37 | 38 | def test_deep_minmax_scaler(): 39 | np.random.seed(352680) 40 | n_ex = 5000 41 | n_channels = 4 42 | dim = 32 43 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 44 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 45 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 46 | for chan in range(n_channels): 47 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 48 | dss = DeepMinMaxScaler() 49 | dss.fit(x) 50 | x_transformed = dss.transform(x) 51 | x_telephone = dss.inverse_transform(x_transformed) 52 | assert x_transformed.shape == x.shape, "Shape mismatch" 53 | assert x_transformed.max() <= 1, "Max greater than 1" 54 | assert x_transformed.min() >= 0, "Min less than 0" 55 | assert np.mean(np.abs(x_telephone - x)) < 50 * np.finfo(np.float32).eps, "Significant differences" 56 | return 57 | 58 | 59 | def test_deep_quantile_transformer(): 60 | np.random.seed(352680) 61 | n_ex = 1000 62 | n_channels = 3 63 | dim = 16 64 | means = np.array([1, 5, -4, 2.5], dtype=np.float64) 65 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float64) 66 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float64) 67 | for chan in range(n_channels): 68 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 69 | dqs = DeepQuantileTransformer(n_quantiles=1000, stochastic=True) 70 | dqs.fit(x) 71 | x_transformed = dqs.transform(x) 72 | x_telephone = dqs.inverse_transform(x_transformed) 73 | reg_qs = QuantileTransformer(n_quantiles=1000, subsample=dim * dim * n_ex) 74 | def flatten_to_2D(X): 75 | return np.reshape(X, newshape=(X.shape[0] * X.shape[1] * X.shape[2], X.shape[-1])) 76 | 77 | x_flat = flatten_to_2D(x) 78 | x_scaled = reg_qs.fit_transform(x_flat) 79 | x_tel_2 = np.reshape(reg_qs.inverse_transform(x_scaled), newshape=(x.shape[0], x.shape[1], x.shape[2], x.shape[3])) 80 | full_diff = np.abs(x - x_telephone).ravel() 81 | reg_diff = np.abs(x_tel_2 - x).ravel() 82 | assert x_transformed.shape == x.shape, "Shape mismatch" 83 | assert x_transformed.max() <= 1, "Max greater than 1" 84 | assert x_transformed.min() >= 0, "Min less than 0" 85 | assert np.max(np.abs(full_diff - reg_diff)) < 1e-8, "significant differences in differences." 
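    # The inverse transform should also recover the original values to within interpolation tolerance.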
86 | assert np.max(np.abs(x_telephone - x)) < 1e-8, "Significant differences" 87 | return -------------------------------------------------------------------------------- /bridgescaler/tests/group_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.group import GroupStandardScaler, GroupMinMaxScaler, GroupRobustScaler 2 | from bridgescaler.backend import create_synthetic_data 3 | from bridgescaler import save_scaler, load_scaler 4 | import numpy as np 5 | import pandas as pd 6 | from os.path import exists 7 | import os 8 | 9 | 10 | def test_group_standard_scaler(): 11 | try: 12 | x_data = create_synthetic_data() 13 | x_data_numpy = x_data.values 14 | groups = [["A", "B"], "C", "D"] 15 | n_groups = [[0, 1], 2, 3] 16 | save_filename = "group_test.json" 17 | group_scaler_n = GroupStandardScaler() 18 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 19 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 20 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 21 | group_scaler = GroupStandardScaler() 22 | transformed_x = group_scaler.fit_transform(x_data, groups) 23 | assert transformed_x.shape == x_data.shape 24 | inverse_x = group_scaler.inverse_transform(transformed_x) 25 | assert inverse_x.shape == x_data.shape 26 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 27 | save_scaler(group_scaler, save_filename) 28 | reloaded_scaler = load_scaler(save_filename) 29 | reloaded_scale_x = reloaded_scaler.transform(x_data) 30 | assert np.all(transformed_x == reloaded_scale_x) 31 | finally: 32 | if exists("group_test.json"): 33 | os.remove("group_test.json") 34 | return 35 | 36 | def test_group_minmax_scaler(): 37 | try: 38 | x_data = create_synthetic_data() 39 | x_data_numpy = x_data.values 40 | groups = [["A", "B"], "C", "D"] 41 | n_groups = [[0, 1], 2, 3] 42 | save_filename = "group_test.json" 43 | group_scaler_n = GroupMinMaxScaler() 44 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 45 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 46 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 47 | group_scaler = GroupMinMaxScaler() 48 | transformed_x = group_scaler.fit_transform(x_data, groups) 49 | assert transformed_x.shape == x_data.shape 50 | inverse_x = group_scaler.inverse_transform(transformed_x) 51 | assert inverse_x.shape == x_data.shape 52 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 53 | save_scaler(group_scaler, save_filename) 54 | reloaded_scaler = load_scaler(save_filename) 55 | reloaded_scale_x = reloaded_scaler.transform(x_data) 56 | assert np.all(transformed_x == reloaded_scale_x) 57 | finally: 58 | if exists("group_test.json"): 59 | os.remove("group_test.json") 60 | return 61 | 62 | 63 | def test_group_robust_scaler(): 64 | try: 65 | x_data = create_synthetic_data() 66 | x_data_numpy = x_data.values 67 | groups = [["A", "B"], "C", "D"] 68 | n_groups = [[0, 1], 2, 3] 69 | save_filename = "group_test.json" 70 | group_scaler_n = GroupRobustScaler() 71 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 72 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 73 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 74 | group_scaler = GroupRobustScaler() 75 | transformed_x = group_scaler.fit_transform(x_data, groups) 76 | assert 
transformed_x.shape == x_data.shape 77 | inverse_x = group_scaler.inverse_transform(transformed_x) 78 | assert inverse_x.shape == x_data.shape 79 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 80 | save_scaler(group_scaler, save_filename) 81 | reloaded_scaler = load_scaler(save_filename) 82 | reloaded_scale_x = reloaded_scaler.transform(x_data) 83 | assert np.all(transformed_x == reloaded_scale_x) 84 | finally: 85 | if exists("group_test.json"): 86 | os.remove("group_test.json") 87 | return -------------------------------------------------------------------------------- /bridgescaler/deep.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class DeepStandardScaler(object): 5 | """ 6 | Calculate standard scaler scores on an arbitrarily dimensional dataset as long as the last dimension is 7 | the variable dimension. 8 | 9 | """ 10 | def __init__(self): 11 | self.mean_ = None 12 | self.sd_ = None 13 | return 14 | 15 | def fit(self, x): 16 | self.mean_ = np.zeros(x.shape[-1], dtype=x.dtype) 17 | self.sd_ = np.zeros(x.shape[-1], dtype=x.dtype) 18 | for v in range(x.shape[-1]): 19 | self.mean_[v] = np.mean(x[..., v]) 20 | self.sd_[v] = np.std(x[..., v], ddof=1) 21 | 22 | def transform(self, x): 23 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 24 | for v in range(x.shape[-1]): 25 | x_transformed[..., v] = (x[..., v] - self.mean_[v]) / self.sd_[v] 26 | return x_transformed 27 | 28 | def fit_transform(self, x): 29 | self.fit(x) 30 | return self.transform(x) 31 | 32 | def inverse_transform(self, x): 33 | x_inverse = np.zeros(x.shape, dtype=x.dtype) 34 | for v in range(x.shape[-1]): 35 | x_inverse[..., v] = x[..., v] * self.sd_[v] + self.mean_[v] 36 | return x_inverse 37 | 38 | 39 | class DeepMinMaxScaler(object): 40 | def __init__(self): 41 | self.max_ = None 42 | self.min_ = None 43 | return 44 | 45 | def fit(self, x): 46 | self.max_ = np.zeros(x.shape[-1], dtype=x.dtype) 47 | self.min_ = np.zeros(x.shape[-1], dtype=x.dtype) 48 | for v in range(x.shape[-1]): 49 | self.max_[v] = np.max(x[..., v]) 50 | self.min_[v] = np.min(x[..., v]) 51 | 52 | def transform(self, x): 53 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 54 | for v in range(x.shape[-1]): 55 | x_transformed[..., v] = (x[..., v] - self.min_[v]) / (self.max_[v] - self.min_[v]) 56 | return x_transformed 57 | 58 | def fit_transform(self, x): 59 | self.fit(x) 60 | return self.transform(x) 61 | 62 | def inverse_transform(self, x): 63 | x_inverse = np.zeros(x.shape, dtype=x.dtype) 64 | for v in range(x.shape[-1]): 65 | x_inverse[..., v] = x[..., v] * (self.max_[v] - self.min_[v]) + self.min_[v] 66 | return x_inverse 67 | 68 | 69 | class DeepQuantileTransformer(object): 70 | """ 71 | Performs a quantile transform on N-dimensional arrays where the variable dimension is the last one. 72 | 73 | Attributes: 74 | n_quantiles: number of quantiles to calculate and store 75 | stochastic: When transforming to quantile space, whether to take the mean of the left and right interpolation values (False) 76 | or to pick a random point in between (True). 
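
    Example:
        A minimal sketch (``x`` here stands for any array whose last dimension indexes the variables)::

            dqt = DeepQuantileTransformer(n_quantiles=100, stochastic=False)
            x_q = dqt.fit_transform(x)        # values mapped into [0, 1]
            x_back = dqt.inverse_transform(x_q)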
77 | """ 78 | def __init__(self, n_quantiles=1000, stochastic=False): 79 | self.n_quantiles = n_quantiles 80 | self.stochastic = stochastic 81 | self.quantiles_ = None 82 | self.references_ = None 83 | self.fitted_ = False 84 | self.x_column_names_ = None 85 | 86 | def fit(self, x): 87 | if hasattr(x, "columns"): 88 | self.x_columns_ = x.columns 89 | else: 90 | self.x_columns_ = np.arange(x.shape[-1]) 91 | self.quantiles_ = np.zeros((x.shape[-1], self.n_quantiles), dtype=x.dtype) 92 | self.references_ = np.linspace(0, 1, self.n_quantiles, endpoint=True) 93 | for v in range(x.shape[-1]): 94 | self.quantiles_[v] = np.nanquantile(x[..., v].ravel(), self.references_) 95 | self.quantiles_[v] = np.maximum.accumulate(self.quantiles_[v]) 96 | return 97 | 98 | def transform(self, x): 99 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 100 | for v in range(x.shape[-1]): 101 | x_transformed[..., v] = self._transform_col(x[..., v].ravel(), v).reshape(x[..., v].shape) 102 | return x_transformed 103 | 104 | def fit_transform(self, x): 105 | self.fit(x) 106 | return self.transform(x) 107 | 108 | def inverse_transform(self, x): 109 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 110 | for v in range(x.shape[-1]): 111 | x_transformed[..., v] = self._inverse_transform_col(x[..., v].ravel(), v).reshape(x[..., v].shape) 112 | return x_transformed 113 | 114 | def _transform_col(self, x_col, col_index): 115 | left_ref = np.interp(x_col, self.quantiles_[col_index], self.references_) 116 | right_ref = -np.interp(-x_col, -self.quantiles_[col_index][::-1], -self.references_[::-1]) 117 | p = 0.5 118 | if self.stochastic: 119 | p = np.random.uniform(0, 1, x_col.size) 120 | return p * left_ref + (1 - p) * right_ref 121 | 122 | def _inverse_transform_col(self, x_col, col_index): 123 | transformed_col = np.interp(x_col, self.references_, self.quantiles_[col_index]) 124 | return transformed_col 125 | -------------------------------------------------------------------------------- /bridgescaler/tests/distributed_tensor_test.py: -------------------------------------------------------------------------------- 1 | # from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 2 | from bridgescaler.distributed_tensor import DStandardScalerTensor, DMinMaxScalerTensor 3 | import numpy as np 4 | import torch 5 | import os 6 | 7 | def make_test_data(): 8 | np.random.seed(34325) 9 | test_data = dict() 10 | col_names = ["a", "b", "c", "d", "e"] 11 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 12 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 13 | test_data["n_examples"] = np.array([1000, 500, 88]) 14 | test_data["numpy_2d"] = [] 15 | test_data["numpy_4d"] = [] 16 | test_data["pandas"] = [] 17 | test_data["xarray"] = [] 18 | tile_width = 5 19 | for n in range(test_data["n_examples"].size): 20 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 21 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 22 | for i in range(test_data["means"].size): 23 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 24 | scale=test_data["sds"][i], 25 | size=test_data["n_examples"][n]) 26 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 27 | scale=test_data["sds"][i], 28 | size=(test_data["n_examples"][n], tile_width, tile_width)) 29 | test_data["numpy_2d"].append(data2d) 30 | test_data["numpy_4d"].append(data4d) 31 | 32 | return test_data 33 | 34 | 35 | # Create test datasets for use in all unit tests. 
36 | test_data = make_test_data() 37 | 38 | def test_dstandard_tensor_scaler(): 39 | numpy_2d_1 = torch.from_numpy(test_data["numpy_2d"][0]) 40 | numpy_2d_2 = torch.from_numpy(test_data["numpy_2d"][1]) 41 | numpy_2d_3 = torch.from_numpy(test_data["numpy_2d"][2]) 42 | all_ds_2d = torch.vstack([numpy_2d_1, numpy_2d_2, numpy_2d_3]) 43 | numpy_4d_1 = torch.from_numpy(test_data["numpy_4d"][0]) 44 | numpy_4d_2 = torch.from_numpy(test_data["numpy_4d"][1]) 45 | numpy_4d_3 = torch.from_numpy(test_data["numpy_4d"][2]) 46 | all_ds_4d = torch.vstack([numpy_4d_1, numpy_4d_2, numpy_4d_3]) 47 | dsses_2d = [] 48 | dsses_4d = [] 49 | for n in range(test_data["n_examples"].size): 50 | dsses_2d.append(DStandardScalerTensor()) 51 | dsses_2d[-1].fit(torch.from_numpy(test_data["numpy_2d"][n])) 52 | dsses_4d.append(DStandardScalerTensor(channels_last=True)) 53 | dsses_4d[-1].fit(torch.from_numpy(test_data["numpy_4d"][n])) 54 | # save_scaler(dsses_2d[-1], "scaler.json") 55 | # new_scaler = load_scaler("scaler.json") 56 | # os.remove("scaler.json") 57 | dss_total_2d = dsses_2d[0] + dsses_2d[1] + dsses_2d[2] 58 | dss_total_4d = dsses_4d[0] + dsses_4d[1] + dsses_4d[2] 59 | mean_2d, var_2d = dss_total_2d.get_scales() 60 | mean_4d, var_4d = dss_total_4d.get_scales() 61 | all_2d_var = all_ds_2d.var(axis=0, unbiased=False) 62 | all_4d_var = torch.tensor([all_ds_4d[..., i].var(unbiased=False) for i in range(all_ds_4d.shape[-1])]) 63 | all_4d_mean = torch.tensor([all_ds_4d[..., i].mean() for i in range(all_ds_4d.shape[-1])]) 64 | assert mean_2d.shape[0] == test_data["means"].shape[0] and var_2d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 65 | assert mean_4d.shape[0] == test_data["means"].shape[0] and var_4d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 66 | assert torch.max(torch.abs(mean_2d - all_ds_2d.mean(axis=0))) < 1e-5, "significant difference in means" 67 | assert torch.max(torch.abs(var_2d - all_2d_var) / all_2d_var) < 1e-5, "significant difference in variances" 68 | assert torch.max(torch.abs(mean_4d - all_4d_mean) / all_4d_mean) < 1e-5, "significant difference in means" 69 | assert torch.max(torch.abs(var_4d - all_4d_var) / all_4d_var) < 1e-5, "significant difference in variances" 70 | 71 | 72 | def test_dminmax_tensor_scaler(): 73 | numpy_2d_1 = torch.from_numpy(test_data["numpy_2d"][0]) 74 | numpy_2d_2 = torch.from_numpy(test_data["numpy_2d"][1]) 75 | numpy_2d_3 = torch.from_numpy(test_data["numpy_2d"][2]) 76 | all_ds_2d = torch.vstack([numpy_2d_1, numpy_2d_2, numpy_2d_3]) 77 | numpy_4d_1 = torch.from_numpy(test_data["numpy_4d"][0]) 78 | numpy_4d_2 = torch.from_numpy(test_data["numpy_4d"][1]) 79 | numpy_4d_3 = torch.from_numpy(test_data["numpy_4d"][2]) 80 | all_ds_4d = torch.vstack([numpy_4d_1, numpy_4d_2, numpy_4d_3]) 81 | dsses_2d = [] 82 | dsses_4d = [] 83 | for n in range(test_data["n_examples"].size): 84 | dsses_2d.append(DMinMaxScalerTensor()) 85 | dsses_2d[-1].fit(torch.from_numpy(test_data["numpy_2d"][n])) 86 | dsses_4d.append(DMinMaxScalerTensor()) 87 | dsses_4d[-1].fit(torch.from_numpy(test_data["numpy_4d"][n])) 88 | #save_scaler(dsses_2d[-1], "scaler.json") 89 | #new_scaler = load_scaler("scaler.json") 90 | #os.remove("scaler.json") 91 | dss_total_2d = dsses_2d[0] + dsses_2d[1] + dsses_2d[2] 92 | dss_total_4d = dsses_4d[0] + dsses_4d[1] + dsses_4d[2] 93 | min_2d, max_2d = dss_total_2d.get_scales() 94 | min_4d, max_4d = dss_total_4d.get_scales() 95 | assert torch.max(torch.abs(min_2d - all_ds_2d.min(axis=0).values)) < 1e-8, "significant difference in minimum" 96 | 
assert torch.max(torch.abs(max_2d - all_ds_2d.max(axis=0).values)) < 1e-8, "significant difference in maximum" 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bridgescaler 2 | Bridge your scikit-learn-style scaler parameters between Python sessions and users. 3 | Bridgescaler allows you to save the properties of a scikit-learn-style scaler object 4 | to a json file, and then repopulate a new scaler object with the same properties. 5 | 6 | 7 | ## Dependencies 8 | * scikit-learn 9 | * numpy 10 | * pandas 11 | * xarray 12 | * pytdigest 13 | 14 | ## Installation 15 | For a stable version of bridgescaler, you can install from PyPI. 16 | ```bash 17 | pip install bridgescaler 18 | ``` 19 | 20 | For the latest version of bridgescaler, install from github. 21 | ```bash 22 | git clone https://github.com/NCAR/bridgescaler.git 23 | cd bridgescaler 24 | pip install . 25 | ``` 26 | 27 | ## Usage 28 | bridgescaler supports all the common scikit-learn scaler classes: 29 | * StandardScaler 30 | * RobustScaler 31 | * MinMaxScaler 32 | * MaxAbsScaler 33 | * QuantileTransformer 34 | * PowerTransformer 35 | * SplineTransformer 36 | 37 | First, create some synthetic data to transform. 38 | ```python 39 | import numpy as np 40 | import pandas as pd 41 | 42 | # specify distribution parameters for each variable 43 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 44 | scales = np.array([1.0, 10, 0.1, 5000.0]) 45 | names = ["A", "B", "C", "D"] 46 | num_examples = 205 47 | x_data_dict = {} 48 | for l in range(locs.shape[0]): 49 | # sample from random normal with different parameters 50 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 51 | x_data = pd.DataFrame(x_data_dict) 52 | ``` 53 | 54 | Now, let's fit and transform the data with StandardScaler. 55 | ```python 56 | from sklearn.preprocessing import StandardScaler 57 | from bridgescaler import save_scaler, load_scaler 58 | 59 | scaler = StandardScaler() 60 | scaler.fit_transform(x_data) 61 | filename = "x_standard_scaler.json" 62 | # save to json file 63 | save_scaler(scaler, filename) 64 | 65 | # create new StandardScaler from json file information. 66 | new_scaler = load_scaler(filename) # new_scaler is a StandardScaler object 67 | ``` 68 | ### Distributed Scaler 69 | The distributed scalers allow you to calculate scaling 70 | parameters on different subsets of a dataset and then combine the scaling factors 71 | together to get representative scaling values for the full dataset. Distributed 72 | Standard Scalers, MinMax Scalers, and Quantile Transformers have been implemented and work with both tabular 73 | and muliti-dimensional patch data in numpy, pandas DataFrame, and xarray DataArray formats. 74 | By default, the scaler assumes your channel/variable dimension is the last 75 | dimension, but if `channels_last=False` is set in the `__init__`, `transform`, 76 | or `inverse_transform` methods, then the 2nd dimension is assumed to be the variable 77 | dimension. It is possible to fit data with one ordering and then 78 | transform it with a different one. 79 | 80 | For large datasets, it may be expensive to redo the scalers if you want to use a 81 | subset or different ordering of variables. 
However, in bridgescaler, the 82 | Distributed Scalers all support arbitrary ordering and subsets of variables for transforms if 83 | the input data are in a Xarray DataArray or Pandas DataFrame with variable 84 | names that match the original data. 85 | 86 | Example: 87 | ```python 88 | from bridgescaler.distributed import DStandardScaler 89 | import numpy as np 90 | 91 | x_1 = np.random.normal(0, 2.2, (20, 5, 4, 8)) 92 | x_2 = np.random.normal(1, 3.5, (25, 4, 8, 5)) 93 | 94 | dss_1 = DStandardScaler(channels_last=False) 95 | dss_2 = DStandardScaler(channels_last=True) 96 | dss_1.fit(x_1) 97 | dss_2.fit(x_2) 98 | dss_combined = np.sum([dss_1, dss_2]) 99 | 100 | dss_combined.transform(x_1, channels_last=False) 101 | ``` 102 | 103 | ### Group Scaler 104 | The group scalers use the same scaling parameters for a group of similar 105 | variables rather than scaling each column independently. This is useful for situations where variables are related, 106 | such as temperatures at different height levels. 107 | 108 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 109 | for numpy arrays. 110 | 111 | For example: 112 | ```python 113 | from bridgescaler.group import GroupStandardScaler 114 | import pandas as pd 115 | import numpy as np 116 | x_rand = np.random.random(size=(100, 5)) 117 | data = pd.DataFrame(data=x_rand, 118 | columns=["a", "b", "c", "d", "e"]) 119 | groups = [["a", "b"], ["c", "d"], "e"] 120 | group_scaler = GroupStandardScaler() 121 | x_transformed = group_scaler.fit_transform(data, groups=groups) 122 | ``` 123 | 124 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 125 | deviation for that group. 126 | 127 | ### Deep Scaler 128 | The deep scalers are designed to scale 2 or 3-dimensional fields input into a 129 | deep learning model such as a convolutional neural network. The scalers assume 130 | that the last dimension is the channel/variable dimension and scales the values accordingly. 131 | The scalers can support 2D or 3D patches with no change in code structure. Support is provided for 132 | DeepStandardScaler and DeepQuantileTransformer. 
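A `DeepMinMaxScaler` with the same fit/transform interface is also exported from `bridgescaler.deep`; a minimal sketch with an illustrative array shape:
```python
from bridgescaler.deep import DeepMinMaxScaler
import numpy as np

x = np.random.random(size=(100, 16, 16, 2)).astype(np.float32)
dmm = DeepMinMaxScaler()
x_scaled = dmm.fit_transform(x)  # each channel rescaled to [0, 1]
x_restored = dmm.inverse_transform(x_scaled)
```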
133 | 134 | Example: 135 | ```python 136 | from bridgescaler.deep import DeepStandardScaler 137 | import numpy as np 138 | np.random.seed(352680) 139 | n_ex = 5000 140 | n_channels = 4 141 | dim = 32 142 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 143 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 144 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 145 | for chan in range(n_channels): 146 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 147 | dss = DeepStandardScaler() 148 | dss.fit(x) 149 | x_transformed = dss.transform(x) 150 | ``` 151 | -------------------------------------------------------------------------------- /bridgescaler/backend.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import (StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, 2 | SplineTransformer, PowerTransformer) 3 | from bridgescaler.group import GroupStandardScaler, GroupRobustScaler, GroupMinMaxScaler 4 | from bridgescaler.deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 5 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 6 | import numpy as np 7 | import json 8 | import pandas as pd 9 | from numpy.lib.format import descr_to_dtype, dtype_to_descr 10 | from base64 import b64decode, b64encode 11 | from typing import Any 12 | 13 | 14 | scaler_objs = {"StandardScaler": StandardScaler, 15 | "MinMaxScaler": MinMaxScaler, 16 | "RobustScaler": RobustScaler, 17 | "MaxAbsScaler": MaxAbsScaler, 18 | "SplineTransformer": SplineTransformer, 19 | "PowerTransformer": PowerTransformer, 20 | "QuantileTransformer": QuantileTransformer, 21 | "GroupStandardScaler": GroupStandardScaler, 22 | "GroupRobustScaler": GroupRobustScaler, 23 | "GroupMinMaxScaler": GroupMinMaxScaler, 24 | "DeepStandardScaler": DeepStandardScaler, 25 | "DeepMinMaxScaler": DeepMinMaxScaler, 26 | "DeepQuantileTransformer": DeepQuantileTransformer, 27 | "DStandardScaler": DStandardScaler, 28 | "DMinMaxScaler": DMinMaxScaler, 29 | "DQuantileScaler": DQuantileScaler, 30 | } 31 | 32 | 33 | def save_scaler(scaler, scaler_file): 34 | """ 35 | Save a scikit-learn or bridgescaler scaler object to json format. 36 | 37 | Args: 38 | scaler: scikit-learn-style scaler object 39 | scaler_file: path to json file where scaler information is stored. 40 | """ 41 | scaler_params = scaler.__dict__ 42 | scaler_params["type"] = str(type(scaler))[1:-2].split(".")[-1] 43 | with open(scaler_file, "w") as file_obj: 44 | json.dump(scaler_params, file_obj, indent=4, sort_keys=True, cls=NumpyEncoder) 45 | return 46 | 47 | 48 | def print_scaler(scaler): 49 | """ 50 | Output scikit-learn or bridgescaler scaler object to json string. 51 | 52 | Args: 53 | scaler: scikit-learn-style scaler object 54 | 55 | Returns: 56 | str representation of object in json format 57 | """ 58 | scaler_params = scaler.__dict__ 59 | scaler_params["type"] = str(type(scaler))[1:-2].split(".")[-1] 60 | return json.dumps(scaler_params, indent=4, sort_keys=True, cls=NumpyEncoder) 61 | 62 | 63 | def object_hook(dct: dict[Any, Any]): 64 | if "__numpy__" in dct: 65 | np_obj = np.frombuffer( 66 | b64decode(dct["__numpy__"]), descr_to_dtype(dct["dtype"]) 67 | ) 68 | return np_obj.reshape(shape) if (shape := dct["shape"]) else np_obj[0] 69 | return dct 70 | 71 | def read_scaler(scaler_str): 72 | """ 73 | Initialize scikit-learn or bridgescaler scaler from json str. 
74 | 75 | Args: 76 | scaler_str: json str 77 | 78 | Returns: 79 | scaler object. 80 | """ 81 | scaler_params = json.loads(scaler_str, object_hook=object_hook) 82 | scaler = scaler_objs[scaler_params["type"]]() 83 | del scaler_params["type"] 84 | for k, v in scaler_params.items(): 85 | if isinstance(v, dict) and v["object"] == "ndarray": 86 | setattr(scaler, k, np.array(v['data'], dtype=v['dtype']).reshape(v['shape'])) 87 | else: 88 | setattr(scaler, k, v) 89 | return scaler 90 | 91 | 92 | def load_scaler(scaler_file): 93 | """ 94 | Initialize scikit-learn or bridgescaler scaler from saved json file. 95 | 96 | Args: 97 | scaler_file: path to json file. 98 | 99 | Returns: 100 | scaler object. 101 | """ 102 | with open(scaler_file, "r") as file_obj: 103 | scaler_str = file_obj.read() 104 | return read_scaler(scaler_str) 105 | 106 | 107 | class NumpyEncoder(json.JSONEncoder): 108 | """ Custom encoder for numpy data types """ 109 | 110 | def default(self, obj): 111 | if int(np.__version__.split('.')[0]) >= 2: 112 | float_types = (np.float16, np.float32, np.float64) 113 | else: 114 | float_types = (np.float_, np.float16, np.float32, np.float64) 115 | 116 | if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, 117 | np.int16, np.int32, np.int64, np.uint8, 118 | np.uint16, np.uint32, np.uint64)): 119 | 120 | return int(obj) 121 | 122 | elif isinstance(obj, float_types): 123 | return float(obj) 124 | 125 | elif isinstance(obj, (np.complex64, np.complex128)): 126 | return {'real': obj.real, 'imag': obj.imag} 127 | 128 | elif isinstance(obj, (np.ndarray,)) and obj.dtype == "|O": 129 | return {'object': 'ndarray', 'dtype': obj.dtype.str, 'shape': list(obj.shape), 130 | 'data': obj.ravel().tolist()} 131 | 132 | elif isinstance(obj, (np.ndarray, np.generic)): 133 | return { 134 | "__numpy__": b64encode( 135 | obj.data if obj.flags.c_contiguous else obj.tobytes() 136 | ).decode(), 137 | "dtype": dtype_to_descr(obj.dtype), 138 | "shape": obj.shape, 139 | } 140 | 141 | elif isinstance(obj, (np.bool_)): 142 | return bool(obj) 143 | 144 | elif isinstance(obj, (np.void)): 145 | return None 146 | 147 | return json.JSONEncoder.default(self, obj) 148 | 149 | 150 | def create_synthetic_data(): 151 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 152 | scales = np.array([1.0, 10, 0.1, 5000.0]) 153 | names = ["A", "B", "C", "D"] 154 | num_examples = 205 155 | x_data_dict = {} 156 | for l in range(locs.shape[0]): 157 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 158 | x_data = pd.DataFrame(x_data_dict) 159 | return x_data 160 | -------------------------------------------------------------------------------- /scripts/eval_scaler.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileTransformer, DQuantileScaler 2 | from bridgescaler import save_scaler, load_scaler 3 | import numpy as np 4 | import pandas as pd 5 | import xarray as xr 6 | import os 7 | from multiprocessing import Pool 8 | import psutil 9 | from scipy.special import ndtri 10 | from scipy.stats import norm 11 | from memory_profiler import profile 12 | 13 | def make_test_data(): 14 | np.random.seed(34325) 15 | test_data = dict() 16 | col_names = ["a", "b", "c", "d", "e"] 17 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 18 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 19 | test_data["n_examples"] = np.array([100000, 500, 88]) 20 | test_data["numpy_2d"] = 
[] 21 | test_data["numpy_4d"] = [] 22 | test_data["pandas"] = [] 23 | test_data["xarray"] = [] 24 | tile_width = 5 25 | for n in range(test_data["n_examples"].size): 26 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 27 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 28 | for i in range(test_data["means"].size): 29 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 30 | scale=test_data["sds"][i], 31 | size=test_data["n_examples"][n]) 32 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 33 | scale=test_data["sds"][i], 34 | size=(test_data["n_examples"][n], tile_width, tile_width)) 35 | test_data["numpy_2d"].append(data2d) 36 | test_data["numpy_4d"].append(data4d) 37 | test_data["pandas"].append(pd.DataFrame(data2d, columns=col_names, index=np.arange(data2d.shape[0]))) 38 | test_data["xarray"].append(xr.DataArray(data4d, 39 | dims=("batch", "y", "x", "variable"), 40 | coords=dict(batch=np.arange(test_data["n_examples"][n]), 41 | y=np.arange(tile_width), 42 | x=np.arange(tile_width), 43 | variable=col_names))) 44 | 45 | return test_data 46 | 47 | def eval_dquantile_scaler(test_data): 48 | np.random.seed(536) 49 | dsses_2d = [] 50 | dsses_4d = [] 51 | #pool = None 52 | pool = Pool(8) 53 | for n in range(test_data["n_examples"].size): 54 | dsses_2d.append(DQuantileScaler()) 55 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 56 | dsses_4d.append(DQuantileScaler()) 57 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 58 | ds_2d_transformed = dsses_2d[-1].transform(test_data["numpy_2d"][n], pool=pool) 59 | ds_4d_transformed = dsses_4d[-1].transform(test_data["numpy_4d"][n], pool=pool) 60 | ds_2d_it = dsses_2d[-1].inverse_transform(ds_2d_transformed, pool=pool) 61 | ds_4d_it = dsses_4d[-1].inverse_transform(ds_4d_transformed, pool=pool) 62 | assert ds_2d_transformed.max() <= 1, "Quantile transform > 1" 63 | assert ds_4d_transformed.max() <= 1, "Quantile transform > 1" 64 | save_scaler(dsses_2d[-1], "scaler.json") 65 | new_scaler = load_scaler("scaler.json") 66 | os.remove("scaler.json") 67 | assert np.nanargmax(np.abs((new_scaler.min_ - dsses_2d[-1].min_))) == 0, \ 68 | "Differences in scaler centroid values after loading" 69 | pd_dss = DQuantileScaler() 70 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0], pool=pool) 71 | pd_inv_trans = pd_dss.inverse_transform(pd_trans, pool=pool) 72 | sub_cols = ["d", "b"] 73 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols], pool=pool) 74 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 75 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans, pool=pool) 76 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 
77 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 78 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 79 | xr_dss = DQuantileScaler(distribution="normal") 80 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0], pool=pool) 81 | xr_inv_trans = xr_dss.inverse_transform(xr_trans, pool=pool) 82 | assert np.all(~np.isnan(xr_trans)), "nans in transform" 83 | assert np.all(~np.isnan(xr_inv_trans)), "nans in inverse transform" 84 | assert xr_trans.shape == test_data["xarray"][0].shape, "shape does not match" 85 | assert xr_inv_trans.shape == test_data["xarray"][0].shape, "shape does not match" 86 | 87 | #assert np.max(np.abs(xr_inv_trans.values - test_data["xarray"][0].values)) < 1e-3, "Differences in transform" 88 | combined_scaler = np.sum(dsses_2d) 89 | assert combined_scaler.size_[0] == test_data["n_examples"].sum(), \ 90 | "Summing did not work properly." 91 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 92 | xr_dss_first = xr_dss.transform(test_data_c_first, channels_last=False, pool=pool) 93 | xr_inv_dss_first = xr_dss.inverse_transform(xr_dss_first, channels_last=False, pool=pool) 94 | assert xr_dss_first.shape == xr_inv_dss_first.shape, "shape does not match" 95 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 96 | xr_dss_f.fit(test_data_c_first) 97 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, pool=pool) 98 | assert scaled_data_quantile_first.shape == test_data_c_first.shape 99 | if pool is not None: 100 | pool.close() 101 | pool.join() 102 | return 103 | 104 | def small_eval(test_data): 105 | process = psutil.Process() 106 | 107 | # Record initial memory usage 108 | 109 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 110 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 111 | xr_dss_f.fit(test_data_c_first) 112 | bt_memory = process.memory_info().rss 113 | initial_memory = process.memory_info().rss 114 | print(initial_memory/1e6) 115 | xr_dss_f.distribution = None 116 | test_data_c_first = xr_dss_f.transform(test_data_c_first) 117 | test_data_c_sec = ndtri(test_data_c_first) 118 | output_arr = np.full((1000, 50, 50), 0.5) 119 | output_arr = norm.ppf(output_arr) 120 | output_arr = np.full((1000, 50, 50), 0.5) 121 | output_arr = ndtri(output_arr) 122 | at_memory = process.memory_info().rss 123 | print("final mem:", at_memory / 1e6) 124 | 125 | print("mem diff:", (at_memory - bt_memory) / 1e6) 126 | return test_data_c_first 127 | 128 | 129 | if __name__ == "__main__": 130 | from time import time 131 | 132 | start = time() 133 | test_data = make_test_data() 134 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 135 | print(test_data["xarray"][0]) 136 | test_data_c_first[:] = small_eval(test_data) 137 | #eval_dquantile_scaler(test_data) 138 | stop = time() 139 | print(stop - start) 140 | -------------------------------------------------------------------------------- /bridgescaler/group.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | 4 | 5 | class GroupBaseScaler(object): 6 | def __init__(self): 7 | self.groups_ = None 8 | self.group_index_ = None 9 | self.x_columns_ = None 10 | 11 | def extract_x_columns(self, x): 12 | """ 13 | 
Extract the variable names to be transformed from x depending on whether x is a pandas DataFrame, an 14 | xarray DataArray, or a numpy array. All of these assume that the columns are in the last dimension. 15 | If x is an xarray DataArray, there should be a coordinate variable with the same name as the last dimension 16 | of the DataArray being transformed. 17 | 18 | Args: 19 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): array of values to be transformed. 20 | 21 | Returns: 22 | xv (numpy.ndarray): Array of values to be transformed. 23 | """ 24 | if hasattr(x, "columns"): 25 | self.x_columns_ = x.columns 26 | xv = x.values 27 | elif hasattr(x, "coords"): 28 | var_dim = x.dims[-1] 29 | self.x_columns_ = x.coords[var_dim].values 30 | xv = x.values 31 | else: 32 | self.x_columns_ = np.arange(x.shape[-1]) 33 | xv = x 34 | return xv 35 | 36 | @staticmethod 37 | def package_transformed_x(x_transformed, x): 38 | """ 39 | Repackage a transformed numpy array into the same datatype as the original x, including 40 | all metadata. 41 | 42 | Args: 43 | x_transformed (numpy.ndarray): array after being transformed or inverse transformed 44 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): original input data whose type and metadata are copied. 45 | 46 | Returns: 47 | x_packaged (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): x_transformed repackaged in the same type as x. 48 | """ 49 | if hasattr(x, "columns"): 50 | x_packaged = copy(x) 51 | x_packaged.loc[:, :] = x_transformed 52 | elif hasattr(x, "coords"): 53 | x_packaged = copy(x) 54 | x_packaged[:] = x_transformed 55 | else: 56 | x_packaged = x_transformed 57 | return x_packaged 58 | 59 | def fit(self, x, groups=None): 60 | self._fit(x, groups) 61 | return self 62 | 63 | def fit_transform(self, x, groups=None): 64 | self._fit(x, groups) 65 | return self.transform(x) 66 | 67 | def transform(self, x): 68 | transformed_x = deepcopy(x) 69 | is_df = hasattr(x, "columns") 70 | for column in self.x_columns_: 71 | group_index = self.find_group(column) 72 | if is_df: 73 | transformed_x.loc[:, column] = self._transform_column(x[column], group_index) 74 | else: 75 | transformed_x[:, column] = self._transform_column(x[:, column], group_index) 76 | return transformed_x 77 | 78 | def inverse_transform(self, x): 79 | transformed_x = deepcopy(x) 80 | is_df = hasattr(x, "columns") 81 | for column in self.x_columns_: 82 | group_index = self.find_group(column) 83 | if is_df: 84 | transformed_x.loc[:, column] = self._inverse_transform_column(x[column], group_index) 85 | else: 86 | transformed_x[:, column] = self._inverse_transform_column(x[:, column], group_index) 87 | return transformed_x 88 | 89 | def set_groups(self, x, groups): 90 | if groups is None: 91 | if hasattr(x, "columns"): 92 | self.groups_ = list(x.columns) 93 | self.x_columns_ = list(x.columns) 94 | else: 95 | self.groups_ = list(range(x.shape[1])) 96 | self.x_columns_ = list(range(x.shape[1])) 97 | else: 98 | self.groups_ = groups 99 | if hasattr(x, "columns"): 100 | self.x_columns_ = list(x.columns) 101 | else: 102 | self.x_columns_ = list(range(x.shape[1])) 103 | self.group_index_ = np.arange(len(self.groups_)) 104 | 105 | def find_group(self, var_name): 106 | group_index = -1 107 | for g, group in enumerate(self.groups_): 108 | if type(group) is not list and var_name == group: 109 | group_index = g 110 | elif type(group) is list and var_name in group: 111 | group_index = g 112 | assert group_index >= 0, var_name + " not found in groups."
113 | return group_index 114 | 115 | def _fit(self, x, groups): 116 | raise NotImplementedError 117 | 118 | def _transform_column(self, x, group_index): 119 | raise NotImplementedError 120 | 121 | def _inverse_transform_column(self, x, group_index): 122 | raise NotImplementedError 123 | 124 | 125 | class GroupStandardScaler(GroupBaseScaler): 126 | """ 127 | Scaler that enables calculation and sharing of scaling parameters among multiple variables via variable groupings. 128 | This is useful for situations where variables are related, such as temperatures at different height levels. 129 | 130 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 131 | for numpy arrays. 132 | 133 | For example: 134 | ``` 135 | groups = [["a", "b"], ["c", "d"], "e"] 136 | ``` 137 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 138 | deviation for that group. 139 | """ 140 | def __init__(self): 141 | self.center_ = None 142 | self.scale_ = None 143 | super().__init__() 144 | 145 | def _fit(self, x, groups=None): 146 | self.set_groups(x, groups) 147 | self.center_ = np.zeros(self.group_index_.shape) 148 | self.scale_ = np.zeros(self.group_index_.shape) 149 | is_df = hasattr(x, "columns") 150 | for g in self.group_index_: 151 | if is_df: 152 | self.center_[g] = np.mean(x[self.groups_[g]].values) 153 | self.scale_[g] = np.std(x[self.groups_[g]].values) 154 | else: 155 | self.center_[g] = np.mean(x[:, self.groups_[g]]) 156 | self.scale_[g] = np.std(x[:, self.groups_[g]]) 157 | 158 | return 159 | 160 | def _transform_column(self, x_column, group_index): 161 | return (x_column - self.center_[group_index]) / self.scale_[group_index] 162 | 163 | def _inverse_transform_column(self, x_column, group_index): 164 | return x_column * self.scale_[group_index] + self.center_[group_index] 165 | 166 | 167 | class GroupMinMaxScaler(GroupBaseScaler): 168 | """ 169 | Group version of MinMaxScaler 170 | """ 171 | def __init__(self, feature_range=(0, 1)): 172 | self.feature_range = feature_range 173 | self.mins_ = None 174 | self.maxes_ = None 175 | GroupBaseScaler.__init__(self) 176 | return 177 | 178 | def _fit(self, x, groups): 179 | self.set_groups(x, groups) 180 | self.mins_ = np.zeros(self.group_index_.shape) 181 | self.maxes_ = np.zeros(self.group_index_.shape) 182 | is_df = hasattr(x, "columns") 183 | for g in self.group_index_: 184 | if is_df: 185 | self.mins_[g] = np.min(x[self.groups_[g]].values) 186 | self.maxes_[g] = np.max(x[self.groups_[g]].values) 187 | else: 188 | self.mins_[g] = np.min(x[:, self.groups_[g]]) 189 | self.maxes_[g] = np.max(x[:, self.groups_[g]]) 190 | return 191 | 192 | def _transform_column(self, x_column, group_index): 193 | x_normed = (x_column - self.mins_[group_index]) / (self.maxes_[group_index] - self.mins_[group_index]) 194 | return x_normed * (self.feature_range[1] - self.feature_range[0]) + self.feature_range[0] 195 | 196 | def _inverse_transform_column(self, x_column, group_index): 197 | x_normed = (x_column - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0]) 198 | return x_normed * (self.maxes_[group_index] - self.mins_[group_index]) + self.mins_[group_index] 199 | 200 | 201 | class GroupRobustScaler(GroupBaseScaler): 202 | """ 203 | Group version of RobustScaler 204 | 205 | """ 206 | def __init__(self, quartile_range=(25.0, 75.0)): 207 | self.quartile_range = quartile_range 208 | self.center_ = None 209 | self.scale_ = None 210 | 
super().__init__() 211 | 212 | def _fit(self, x, groups): 213 | self.set_groups(x, groups) 214 | self.center_ = np.zeros(self.group_index_.shape) 215 | self.scale_ = np.zeros(self.group_index_.shape) 216 | is_df = hasattr(x, "columns") 217 | for g in self.group_index_: 218 | if is_df: 219 | self.center_[g] = np.median(x[self.groups_[g]]) 220 | self.scale_[g] = np.abs(np.quantile(x[self.groups_[g]], self.quartile_range[1] / 100.0) - 221 | np.quantile(x[self.groups_[g]], self.quartile_range[0] / 100.0)) 222 | else: 223 | self.center_[g] = np.median(x[:, self.groups_[g]]) 224 | self.scale_[g] = np.abs(np.quantile(x[:, self.groups_[g]], self.quartile_range[1] / 100.0) - 225 | np.quantile(x[:, self.groups_[g]], self.quartile_range[0] / 100.0)) 226 | 227 | def _transform_column(self, x_column, group_index): 228 | return (x_column - self.center_[group_index]) / self.scale_[group_index] 229 | 230 | def _inverse_transform_column(self, x_column, group_index): 231 | return x_column * self.scale_[group_index] + self.center_[group_index] 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /bridgescaler/tests/distributed_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 2 | from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 3 | import numpy as np 4 | import pandas as pd 5 | import xarray as xr 6 | import os 7 | from multiprocessing import Pool 8 | 9 | def make_test_data(): 10 | np.random.seed(34325) 11 | test_data = dict() 12 | col_names = ["a", "b", "c", "d", "e"] 13 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 14 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 15 | test_data["n_examples"] = np.array([1000, 500, 88]) 16 | test_data["numpy_2d"] = [] 17 | test_data["numpy_4d"] = [] 18 | test_data["pandas"] = [] 19 | test_data["xarray"] = [] 20 | tile_width = 5 21 | for n in range(test_data["n_examples"].size): 22 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 23 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 24 | for i in range(test_data["means"].size): 25 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 26 | scale=test_data["sds"][i], 27 | size=test_data["n_examples"][n]) 28 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 29 | scale=test_data["sds"][i], 30 | size=(test_data["n_examples"][n], tile_width, tile_width)) 31 | test_data["numpy_2d"].append(data2d) 32 | test_data["numpy_4d"].append(data4d) 33 | test_data["pandas"].append(pd.DataFrame(data2d, columns=col_names, index=np.arange(data2d.shape[0]))) 34 | test_data["xarray"].append(xr.DataArray(data4d, 35 | dims=("batch", "y", "x", "variable"), 36 | coords=dict(batch=np.arange(test_data["n_examples"][n]), 37 | y=np.arange(tile_width), 38 | x=np.arange(tile_width), 39 | variable=col_names))) 40 | 41 | return test_data 42 | 43 | 44 | # Create test datasets for use in all unit tests. 
45 | test_data = make_test_data() 46 | 47 | def test_dstandard_scaler(): 48 | all_ds_2d = np.vstack(test_data["numpy_2d"]) 49 | all_ds_4d = np.vstack(test_data["numpy_4d"]) 50 | dsses_2d = [] 51 | dsses_4d = [] 52 | for n in range(test_data["n_examples"].size): 53 | dsses_2d.append(DStandardScaler()) 54 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 55 | dsses_4d.append(DStandardScaler(channels_last=True)) 56 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 57 | save_scaler(dsses_2d[-1], "scaler.json") 58 | new_scaler = load_scaler("scaler.json") 59 | os.remove("scaler.json") 60 | pd_dss = DStandardScaler() 61 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0]) 62 | pd_inv_trans = pd_dss.inverse_transform(pd_trans) 63 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 64 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 65 | xr_dss = DStandardScaler() 66 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0]) 67 | xr_inv_trans = xr_dss.inverse_transform(xr_trans) 68 | assert type(xr_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through transform" 69 | assert type(xr_inv_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through inverse" 70 | dss_total_2d = np.sum(dsses_2d) 71 | dss_total_4d = np.sum(dsses_4d) 72 | mean_2d, var_2d = dss_total_2d.get_scales() 73 | mean_4d, var_4d = dss_total_4d.get_scales() 74 | all_2d_var = all_ds_2d.var(axis=0) 75 | all_4d_var = np.array([all_ds_4d[..., i].var() for i in range(all_ds_4d.shape[-1])]) 76 | all_4d_mean = np.array([all_ds_4d[..., i].mean() for i in range(all_ds_4d.shape[-1])]) 77 | assert mean_2d.shape[0] == test_data["means"].shape[0] and var_2d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 78 | assert mean_4d.shape[0] == test_data["means"].shape[0] and var_4d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 79 | assert np.max(np.abs(mean_2d - all_ds_2d.mean(axis=0))) < 1e-5, "significant difference in means" 80 | assert np.max(np.abs(var_2d - all_2d_var) / all_2d_var) < 1e-5, "significant difference in variances" 81 | assert np.max(np.abs(mean_4d - all_4d_mean) / all_4d_mean) < 1e-5, "significant difference in means" 82 | assert np.max(np.abs(var_4d - all_4d_var) / all_4d_var) < 1e-5, "significant difference in variances" 83 | sub_cols = ["d", "b"] 84 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols]) 85 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 86 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans) 87 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 
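# Illustrative usage sketch (not one of the unit tests above): the distributed scalers are designed for
# map-reduce style fitting, where a separate scaler is fit on each shard of data and the fitted scalers
# are then summed to represent the full dataset, as exercised in test_dstandard_scaler above. The shard
# arrays below are hypothetical stand-ins for data loaded from separate files.
def example_dstandard_scaler_map_reduce():
    shards = [np.random.normal(size=(200, 5)) for _ in range(3)]  # pretend each shard came from one file
    shard_scalers = []
    for shard in shards:
        scaler = DStandardScaler()
        scaler.fit(shard)  # "map" step: fit each shard independently
        shard_scalers.append(scaler)
    combined = np.sum(shard_scalers)  # "reduce" step: merge the per-shard statistics
    pooled_mean, pooled_var = combined.get_scales()  # pooled mean and variance per column
    scaled = combined.transform(shards[0])  # transform any shard with full-dataset statistics
    return pooled_mean, pooled_var, scaled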
88 | 89 | 90 | def test_dminmax_scaler(): 91 | all_ds_2d = np.vstack(test_data["numpy_2d"]) 92 | dsses_2d = [] 93 | dsses_4d = [] 94 | for n in range(test_data["n_examples"].size): 95 | dsses_2d.append(DMinMaxScaler()) 96 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 97 | dsses_4d.append(DMinMaxScaler()) 98 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 99 | save_scaler(dsses_2d[-1], "scaler.json") 100 | new_scaler = load_scaler("scaler.json") 101 | os.remove("scaler.json") 102 | dss_total_2d = np.sum(dsses_2d) 103 | dss_total_4d = np.sum(dsses_4d) 104 | min_2d, max_2d = dss_total_2d.get_scales() 105 | min_4d, max_4d = dss_total_4d.get_scales() 106 | n_cols = test_data["numpy_2d"][0].shape[1] 107 | pd_dss = DMinMaxScaler() 108 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0]) 109 | pd_inv_trans = pd_dss.inverse_transform(pd_trans) 110 | sub_cols = ["d", "b"] 111 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols]) 112 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 113 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans) 114 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 115 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 116 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 117 | xr_dss = DMinMaxScaler() 118 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0]) 119 | xr_inv_trans = xr_dss.inverse_transform(xr_trans) 120 | assert type(xr_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through transform" 121 | assert type(xr_inv_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through inverse" 122 | assert min_2d.shape[0] == n_cols and max_2d.shape[0] == n_cols, "Stat shape mismatch" 123 | assert min_4d.shape[0] == n_cols and max_4d.shape[0] == n_cols, "Stat shape mismatch" 124 | assert np.max(np.abs(min_2d - all_ds_2d.min(axis=0))) < 1e-8, "significant difference in means" 125 | assert np.max(np.abs(max_2d - all_ds_2d.max(axis=0))) < 1e-8, "significant difference in variances" 126 | 127 | 128 | def test_dquantile_scaler(): 129 | dsses_2d = [] 130 | dsses_4d = [] 131 | pool = Pool(2) 132 | for n in range(test_data["n_examples"].size): 133 | dsses_2d.append(DQuantileScaler()) 134 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 135 | dsses_4d.append(DQuantileScaler()) 136 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 137 | ds_2d_transformed = dsses_2d[-1].transform(test_data["numpy_2d"][n], pool=pool) 138 | ds_4d_transformed = dsses_4d[-1].transform(test_data["numpy_4d"][n], pool=pool) 139 | ds_2d_it = dsses_2d[-1].inverse_transform(ds_2d_transformed, pool=pool) 140 | ds_4d_it = dsses_4d[-1].inverse_transform(ds_4d_transformed, pool=pool) 141 | assert ds_2d_transformed.max() <= 1, "Quantile transform > 1" 142 | assert ds_4d_transformed.max() <= 1, "Quantile transform > 1" 143 | save_scaler(dsses_2d[-1], "scaler.json") 144 | new_scaler = load_scaler("scaler.json") 145 | os.remove("scaler.json") 146 | assert np.nanargmax(np.abs((new_scaler.min_ - dsses_2d[-1].min_))) == 0, \ 147 | "Differences in scaler centroid values after loading" 148 | pd_dss = DQuantileScaler() 149 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0], pool=pool) 150 | pd_inv_trans = pd_dss.inverse_transform(pd_trans, pool=pool) 151 | sub_cols = ["d", "b"] 152 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols], pool=pool) 153 | assert 
pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 154 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans, pool=pool) 155 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 156 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 157 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 158 | xr_dss = DQuantileScaler(distribution="normal") 159 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0], pool=pool) 160 | xr_inv_trans = xr_dss.inverse_transform(xr_trans, pool=pool) 161 | assert np.all(~np.isnan(xr_trans)), "nans in transform" 162 | assert np.all(~np.isnan(xr_inv_trans)), "nans in inverse transform" 163 | assert xr_trans.shape == test_data["xarray"][0].shape, "shape does not match" 164 | assert xr_inv_trans.shape == test_data["xarray"][0].shape, "shape does not match" 165 | 166 | # assert np.max(np.abs(xr_inv_trans.values - test_data["xarray"][0].values)) < 1e-3, "Differences in transform" 167 | combined_scaler = np.sum(dsses_2d) 168 | assert combined_scaler.size_[0] == test_data["n_examples"].sum(), \ 169 | "Summing did not work properly." 170 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 171 | xr_dss_first = xr_dss.transform(test_data_c_first, channels_last=False, pool=pool) 172 | xr_inv_dss_first = xr_dss.inverse_transform(xr_dss_first, channels_last=False, pool=pool) 173 | assert xr_dss_first.shape == xr_inv_dss_first.shape, "shape does not match" 174 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 175 | xr_dss_f.fit(test_data_c_first) 176 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, pool=pool) 177 | assert scaled_data_quantile_first.shape == test_data_c_first.shape 178 | if pool is not None: 179 | pool.close() 180 | pool.join() 181 | return 182 | 183 | if __name__ == "__main__": 184 | from time import perf_counter 185 | start = perf_counter() 186 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 187 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 188 | xr_dss_f.fit(test_data_c_first, n_jobs=16) 189 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, n_jobs=16) 190 | stop = perf_counter() 191 | print(stop - start) 192 | -------------------------------------------------------------------------------- /bridgescaler/distributed_tensor.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import importlib.util 3 | 4 | from packaging import version 5 | import torch 6 | 7 | REQUIRED_VERSION = "2.0.0" # required torch version 8 | 9 | # Check if PyTorch is installed 10 | if importlib.util.find_spec("torch") is None: 11 | raise ImportError("PyTorch is not installed") 12 | 13 | installed_version = torch.__version__ 14 | 15 | # Validate version 16 | if version.parse(installed_version) < version.parse(REQUIRED_VERSION): 17 | raise RuntimeError( 18 | f"PyTorch version mismatch: required {REQUIRED_VERSION}, " 19 | f"found {installed_version}" 20 | ) 21 | 22 | 23 | class DBaseScalerTensor: 24 | """ 25 | Base distributed scaler class for tensor. Used only to store attributes and methods shared across all distributed 26 | scaler subclasses. 
27 | """ 28 | 29 | def __init__(self, channels_last=True): 30 | self.x_columns_ = None 31 | self._fit = False 32 | self.channels_last = channels_last 33 | 34 | def is_fit(self): 35 | return self._fit 36 | 37 | @staticmethod 38 | def extract_x_columns(x, channels_last=True): 39 | """ 40 | Extract column indices to be transformed from x. All of these assume that the columns are in the last dimension. 41 | 42 | Args: 43 | x (torch.tensor): tensor of values to be transformed. 44 | channels_last (bool): If True, then assume the variable or channel dimension is the last dimension of the 45 | array. If False, then assume the variable or channel dimension is second. 46 | 47 | Returns: 48 | x_columns (torch.tensor): tensor of column indices. 49 | """ 50 | var_dim_num = -1 51 | if not channels_last: 52 | var_dim_num = 1 53 | assert isinstance(x, torch.Tensor), "Input must be a PyTorch tensor" 54 | x_columns = torch.arange(x.shape[var_dim_num]) 55 | return x_columns 56 | 57 | def set_channel_dim(self, channels_last=None): 58 | if channels_last is None: 59 | channels_last = self.channels_last 60 | if channels_last: 61 | channel_dim = -1 62 | else: 63 | channel_dim = 1 64 | return channel_dim 65 | 66 | def process_x_for_transform(self, x, channels_last=None): 67 | if channels_last is None: 68 | channels_last = self.channels_last 69 | channel_dim = self.set_channel_dim(channels_last) 70 | assert self._fit, "Scaler has not been fit." 71 | assert ( 72 | x.shape[channel_dim] == self.x_columns_.shape[0] 73 | ), "Number of input columns does not match scaler." 74 | x_col_order = torch.arange(x.shape[channel_dim]) 75 | xv = x 76 | x_transformed = torch.zeros(xv.shape, dtype=xv.dtype) 77 | return xv, x_transformed, channels_last, channel_dim, x_col_order 78 | 79 | def fit(self, x, weight=None): 80 | pass 81 | 82 | def transform(self, x, channels_last=None): 83 | pass 84 | 85 | def fit_transform(self, x, channels_last=None, weight=None): 86 | self.fit(x, weight=weight) 87 | return self.transform(x, channels_last=channels_last) 88 | 89 | def inverse_transform(self, x, channels_last=None): 90 | pass 91 | 92 | def __add__(self, other): 93 | pass 94 | 95 | def subset_columns(self, sel_columns): 96 | pass 97 | 98 | def add_variables(self, other): 99 | pass 100 | 101 | 102 | class DStandardScalerTensor(DBaseScalerTensor): 103 | """ 104 | Distributed version of StandardScaler. You can calculate this map-reduce style by running it on individual 105 | data files, returning the fitted objects, and then summing them together to represent the full dataset. Scaler 106 | supports torch.tensor and returns a transformed tensor. 
107 | """ 108 | 109 | def __init__(self, channels_last=True): 110 | self.mean_x_ = None 111 | self.n_ = 0 112 | self.var_x_ = None 113 | super().__init__(channels_last=channels_last) 114 | 115 | def fit(self, x, weight=None): 116 | x_columns = self.extract_x_columns(x, channels_last=self.channels_last) 117 | xv = x 118 | channel_dim = self.set_channel_dim() 119 | if not self._fit: 120 | self.x_columns_ = x_columns 121 | if len(xv.shape) > 2: 122 | if self.channels_last: 123 | self.n_ += torch.prod(torch.tensor(xv.shape[:-1])) 124 | else: 125 | self.n_ += xv.shape[0] * \ 126 | torch.prod(torch.tensor(xv.shape[2:])) 127 | else: 128 | self.n_ += xv.shape[0] 129 | self.mean_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 130 | self.var_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 131 | 132 | if self.channels_last: 133 | for i in range(xv.shape[channel_dim]): 134 | self.mean_x_[i] = torch.mean(xv[..., i]) 135 | self.var_x_[i] = torch.var(xv[..., i], correction=0) 136 | else: 137 | for i in range(xv.shape[channel_dim]): 138 | self.mean_x_[i] = torch.mean(xv[:, i]) 139 | self.var_x_[i] = torch.var(xv[:, i], correction=0) 140 | 141 | else: 142 | # Update existing scaler with new data 143 | assert ( 144 | x.shape[channel_dim] == self.x_columns_.shape[0] 145 | ), "New data has a different number of columns" 146 | if self.channels_last: 147 | x_col_order = torch.arange(x.shape[-1]) 148 | else: 149 | x_col_order = torch.arange(x.shape[1]) 150 | if len(xv.shape) > 2: 151 | if self.channels_last: 152 | new_n = torch.prod(torch.tensor(xv.shape[:-1])) 153 | else: 154 | new_n = xv.shape[0] * \ 155 | torch.prod(torch.tensor(xv.shape[2:])) 156 | else: 157 | new_n = xv.shape[0] 158 | for i, o in enumerate(x_col_order): 159 | if self.channels_last: 160 | new_mean = torch.mean(xv[..., i]) 161 | new_var = torch.var(xv[..., i], correction=0) 162 | else: 163 | new_mean = torch.mean(xv[:, i]) 164 | new_var = torch.var(xv[:, i], correction=0) 165 | combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / ( 166 | self.n_ + new_n 167 | ) 168 | weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / ( 169 | self.n_ + new_n 170 | ) 171 | var_correction = ( 172 | self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2 173 | ) / ((self.n_ + new_n) ** 2) 174 | self.mean_x_[o] = combined_mean 175 | self.var_x_[o] = weighted_var + var_correction 176 | self.n_ += new_n 177 | self._fit = True 178 | 179 | def transform(self, x, channels_last=None): 180 | """ 181 | Transform the input data from its original form to standard scaled form. If your input data has a 182 | different dimension order than the data used to fit the scaler, use the channels_last keyword argument 183 | to specify whether the new data are `channels_last` (True) or `channels_first` (False). 184 | 185 | Args: 186 | x (torch.tensor): Input data. 187 | channels_last: Override the default channels_last parameter of the scaler. 188 | 189 | Returns: 190 | x_transformed (torch.tensor): Transformed data in the same shape and type as x. 
191 | """ 192 | ( 193 | xv, 194 | x_transformed, 195 | channels_last, 196 | channel_dim, 197 | x_col_order, 198 | ) = self.process_x_for_transform(x, channels_last) 199 | x_mean, x_var = self.get_scales() 200 | if channels_last: 201 | for i, o in enumerate(x_col_order): 202 | x_transformed[..., i] = ( 203 | xv[..., i] - x_mean[o]) / torch.sqrt(x_var[o]) 204 | else: 205 | for i, o in enumerate(x_col_order): 206 | x_transformed[:, i] = ( 207 | xv[:, i] - x_mean[o]) / torch.sqrt(x_var[o]) 208 | return x_transformed 209 | 210 | def inverse_transform(self, x, channels_last=None): 211 | ( 212 | xv, 213 | x_transformed, 214 | channels_last, 215 | channel_dim, 216 | x_col_order, 217 | ) = self.process_x_for_transform(x, channels_last) 218 | x_mean, x_var = self.get_scales() 219 | if channels_last: 220 | for i, o in enumerate(x_col_order): 221 | x_transformed[..., i] = xv[..., i] * \ 222 | torch.sqrt(x_var[o]) + x_mean[o] 223 | else: 224 | for i, o in enumerate(x_col_order): 225 | x_transformed[:, i] = xv[:, i] * \ 226 | torch.sqrt(x_var[o]) + x_mean[o] 227 | return x_transformed 228 | 229 | def get_scales(self): 230 | return self.mean_x_, self.var_x_ 231 | 232 | def __add__(self, other): 233 | assert ( 234 | type(other) is DStandardScalerTensor 235 | ), "Input is not DStandardScalerTensor" 236 | assert torch.all( 237 | other.x_columns_ == self.x_columns_ 238 | ), "Scaler columns do not match." 239 | current = deepcopy(self) 240 | current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / ( 241 | self.n_ + other.n_ 242 | ) 243 | combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / ( 244 | self.n_ + other.n_ 245 | ) 246 | combined_var_corr = ( 247 | self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2 248 | ) / ((self.n_ + other.n_) ** 2) 249 | current.var_x_ = combined_var + combined_var_corr 250 | current.n_ = self.n_ + other.n_ 251 | return current 252 | 253 | 254 | class DMinMaxScalerTensor(DBaseScalerTensor): 255 | """ 256 | Distributed MinMaxScaler enables calculation of min and max of variables in datasets in parallel, then combining 257 | the mins and maxes as a reduction step. Scaler 258 | supports torch.tensor and will return a transformed array in the 259 | same form as the original with column or coordinate names preserved. 
260 | """ 261 | 262 | def __init__(self, channels_last=True): 263 | self.max_x_ = None 264 | self.min_x_ = None 265 | super().__init__(channels_last=channels_last) 266 | 267 | def fit(self, x, weight=None): 268 | x_columns = self.extract_x_columns(x, channels_last=self.channels_last) 269 | xv = x 270 | channel_dim = self.set_channel_dim() 271 | if not self._fit: 272 | self.x_columns_ = x_columns 273 | self.max_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 274 | self.min_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 275 | 276 | if self.channels_last: 277 | for i in range(xv.shape[channel_dim]): 278 | self.max_x_[i] = torch.max(xv[..., i]) 279 | self.min_x_[i] = torch.min(xv[..., i]) 280 | else: 281 | for i in range(xv.shape[channel_dim]): 282 | self.max_x_[i] = torch.max(xv[:, i]) 283 | self.min_x_[i] = torch.min(xv[:, i]) 284 | else: 285 | # Update existing scaler with new data 286 | assert ( 287 | x.shape[channel_dim] == self.x_columns_.shape[0] 288 | ), "New data has a different number of columns" 289 | if self.channels_last: 290 | x_col_order = torch.arange(x.shape[-1]) 291 | else: 292 | x_col_order = torch.arange(x.shape[1]) 293 | if self.channels_last: 294 | for i, o in enumerate(x_col_order): 295 | self.max_x_[o] = torch.maximum( 296 | self.max_x_[o], torch.max(xv[..., i]) 297 | ) 298 | self.min_x_[o] = torch.minimum( 299 | self.min_x_[o], torch.min(xv[..., i]) 300 | ) 301 | else: 302 | for i, o in enumerate(xv.shape[channel_dim]): 303 | self.max_x_[o] = torch.maximum( 304 | self.max_x_[o], torch.max(xv[:, i])) 305 | self.min_x_[o] = torch.minimum( 306 | self.min_x_[o], torch.min(xv[:, i])) 307 | self._fit = True 308 | 309 | def transform(self, x, channels_last=None): 310 | ( 311 | xv, 312 | x_transformed, 313 | channels_last, 314 | channel_dim, 315 | x_col_order, 316 | ) = self.process_x_for_transform(x, channels_last) 317 | if channels_last: 318 | for i, o in enumerate(x_col_order): 319 | x_transformed[..., i] = (xv[..., i] - self.min_x_[o]) / ( 320 | self.max_x_[o] - self.min_x_[o] 321 | ) 322 | else: 323 | for i, o in enumerate(x_col_order): 324 | x_transformed[:, i] = (xv[:, i] - self.min_x_[o]) / ( 325 | self.max_x_[o] - self.min_x_[o] 326 | ) 327 | return x_transformed 328 | 329 | def inverse_transform(self, x, channels_last=None): 330 | ( 331 | xv, 332 | x_transformed, 333 | channels_last, 334 | channel_dim, 335 | x_col_order, 336 | ) = self.process_x_for_transform(x, channels_last) 337 | if channels_last: 338 | for i, o in enumerate(x_col_order): 339 | x_transformed[..., i] = ( 340 | xv[..., i] * (self.max_x_[o] - self.min_x_[o] 341 | ) + self.min_x_[o] 342 | ) 343 | else: 344 | for i, o in enumerate(x_col_order): 345 | x_transformed[:, i] = ( 346 | xv[:, i] * (self.max_x_[o] - self.min_x_[o]) + 347 | self.min_x_[o] 348 | ) 349 | return x_transformed 350 | 351 | def get_scales(self): 352 | return self.min_x_, self.max_x_ 353 | 354 | def __add__(self, other): 355 | assert type(other) is DMinMaxScalerTensor, "Input is not DMinMaxScaler" 356 | assert torch.all( 357 | other.x_columns_ == self.x_columns_ 358 | ), "Scaler columns do not match." 
359 | current = deepcopy(self) 360 | current.max_x_ = torch.maximum(self.max_x_, other.max_x_) 361 | current.min_x_ = torch.minimum(self.min_x_, other.min_x_) 362 | return current 363 | -------------------------------------------------------------------------------- /notebooks/Bridgscaler_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63310dd8-a974-4bcb-8d91-2250077b548e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Bridgescaler Introduction" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "32c43d07-5af2-47c3-9656-fc11f640f66c", 14 | "metadata": {}, 15 | "source": [ 16 | "This is a short notebook covering some of the features and use cases of bridgescaler. The main repository can be found [here](https://github.com/NCAR/bridgescaler).\n", 17 | "\n", 18 | "Bridgescaler is designed to add some functionality to scikit-learn pre-processors. " 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "2d0b7dda-e7af-4fab-9069-2690c860f0f1", 24 | "metadata": {}, 25 | "source": [ 26 | "#### Install" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "id": "7848129f-d7b5-48d3-8da4-6e5b09026777", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# uncomment the line below if you need to install bridgescaler\n", 37 | "# !pip install bridgescaler" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "9b07b11c-8981-4e58-a8fa-a06f07cca893", 43 | "metadata": {}, 44 | "source": [ 45 | "## Imports" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "838e6400-b2b9-43d0-8a31-844dba199f7f", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import numpy as np\n", 56 | "import pandas as pd\n", 57 | "import matplotlib.pyplot as plt\n", 58 | "\n", 59 | "import bridgescaler \n", 60 | "from bridgescaler import save_scaler, load_scaler\n", 61 | "from bridgescaler.group import GroupStandardScaler\n", 62 | "from sklearn.preprocessing import QuantileTransformer" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "36dd5aaf-1555-46a0-b0eb-76546d72c63c", 68 | "metadata": {}, 69 | "source": [ 70 | "## Numpy Example" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "id": "634f7659-21c1-4f96-ab3c-237ea85ee588", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Define mean and standard deviation\n", 81 | "mean = 500 # This is approximately the midpoint of 10 and 1000\n", 82 | "std_dev = 300 # This value is chosen to spread values roughly within the desired range\n", 83 | "\n", 84 | "# Generate the array\n", 85 | "gaussian_array = np.random.normal(mean, std_dev, 10000)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "id": "f7be8c82-9f58-4835-8f95-5606b7ac6850", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Remaining array shape: (8000,)\n", 99 | "Random samples shape: (2000,)\n" 100 | ] 101 | }, 102 | { 103 | "data": { 104 | "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/H5lhTAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzCklEQVR4nO3deVhV5aLH8d9WBhFxCyJsOCLi7CMOpSeVyjEHTmpF9zh0jmmZN3MoUrPMa1KPOZ1EKzMbzKlMvaXVPZqKj/OUSnpzqqMpDiVShqBmjOv+4XUdNyAKbtgL+H6eZz0Pa613rfWu1y37x7vetZbNMAxDAAAAFlLJ3RUAAADIi4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsx8PdFSiO3Nxc/fzzz/Lz85PNZnN3dQAAwG0wDEOXLl1SaGioKlUqvI+kTAaUn3/+WWFhYe6uBgAAKIYzZ86odu3ahZYpkwHFz89P0rUTrF69uptrAwAAbkd6errCwsLM7/HClMmAcv2yTvXq1QkoAACUMbczPINBsgAAwHIIKAAAwHIIKAAAwHLK5BgUAMA1OTk5ysrKcnc1AJOnp6cqV658x/shoABAGXX58mWdPXtWhmG4uyqAyWazqXbt2qpWrdod7YeAAgBlUE5Ojs6ePauqVauqVq1aPLQSlmAYhn755RedPXtWDRs2vKOeFAIKAJRBWVlZMgxDtWrVko+Pj7urA5hq1aqlpKQkZWVl3VFAYZAsAJRh9JzAalz1mSSgAAAAyyGgAAAqrMGDB+vhhx8u0jY2m01ffPFFidQH/8YYFAAoR+q+tLpUj5c07cFSPZ6rvfnmm0W+C+rcuXPy9/cvoRrhOgIKAMBtMjMz5eXl5bbj2+32Im/jcDhKoCZ3LisrS56enre93Oq4xAMAKDWdOnXSyJEjNXr0aAUGBqpbt26SpCNHjugvf/mLqlWrpuDgYA0cOFC//vqr03ajRo1SbGys/P39FRwcrPfff19XrlzRE088IT8/P9WvX19ff/21uU1OTo6GDBmiiIgI+fj4qHHjxnrzzTed6pP3Ek+nTp307LPPaty4cQoICJDD4VBcXJzTNjde4klKSpLNZtPKlSvVuXNnVa1aVS1bttSuXbuctvnggw8UFhamqlWr6pFHHlF8fLxq1KhRaFu9+OKLatSokapWrap69epp4sSJTg/li4uLU6tWrfTRRx+pXr168vb2lmEYstlsmjdvnh566CH5+vpq8uTJt2yLrVu3ytPTU8nJyU51GDNmjDp06FBoPUsKAQUAUKoWLVokDw8P7dixQ++9957OnTunjh07qlWrVtq3b5/Wrl2r8+fPq2/fvvm2CwwM1J49ezRq1Cg988wz+utf/6qoqCh9++236tGjhwYOHKjff/9dkpSbm6vatWtrxYoVOnLkiF555RW9/PLLWrFixS3r5+vrq2+++UYzZszQa6+9poSEhEK3mTBhgsaOHasDBw6oUaNGGjBggLKzsyVJO3bs0LBhw/Tcc8/pwIED6tatm15//fVbtpOfn58WLlyoI0eO6M0339QHH3ygWbNmOZU5fvy4VqxYoc8//1wHDhwwl0+aNEkPPfSQDh48qCeffPKWbdGhQwfVq1dPS5YsMfeRnZ2tjz/+WE888cQt61oSbEYZfARhenq67Ha70tLSVL16dXdXByjzChq3UNbHFpR3f/zxh06ePKmIiAhVqVLFXG71MSidOnVSWlqa9u/fby575ZVX9M0332jdunXmsrNnzyosLEw//PCDGjVqpE6dOiknJ0fbtm2TdK13xG63KyYmRosXL5YkJScnKyQkRLt27VK7du0KPP6IESN0/vx5ffbZZ5Ku9aBcvHjR7BHJexxJuueee9SlSxdNmzZN0rUelFWrVunhhx9WUlKSIiIi9OGHH2rIkCGSrvUGNWvWTEePHlWTJk3Uv39/Xb58Wf/85z/Nff7973/XP//5T128ePG22+4f//iHli9frn379km61oMyZcoU/fTTT6pVq5ZZzmazKTY2Nl+YuVVbzJgxwwxEkvTll1/q73//u5KTk+Xr63vb9bzZZ1Mq2vc3PSgAgFLVpk0bp/nExERt2rRJ1apVM6cmTZpIkn788UezXIsWLcyfK1eurJo1a6p58+bmsuDgYElSSkqKuWzevHlq06aNatWqpWrVqumDDz7Q6dOnC63fjceRpJCQEKd93mqbkJAQp3r88MMPuueee5zK550vyGeffab77rtPDodD1apV08SJE/PVPTw83CmcXJe3jaVbt8XgwYN1/Phx7d69W5L00UcfqW/fvkUKJ67EIFkAQKnK+4WXm5ur3r17a/r06fnKXv+yl5RvoKfNZnNadv0BYbm5uZKkFStW6Pnnn9fMmTPVvn17+fn56R//+Ie++eabQutX0HGu7/N2tslbj+vjQm50q4sXu3fvVv/+/fXqq6+qR48estvtWrZsmWbOnOlU7mbhIe/y22mLoKAg9e7dWwsWLFC9evW0Zs0abd68udB6liQCCgDAre6++259/vnnqlu3rjw8XPe1tG3bNkVFRWn48OHmsht7ZEpLkyZNtGfPHqdl1y/T3MyOHTsUHh6uCRMmmMtOnTpV7Drcbls89dRT6t+/v2rXrq369evr3nvvLfYx7xSXeAAAbjVixAj99ttvGjBggPbs2aMTJ05o/fr1evLJJ5WTk1Ps/TZo0ED79u3TunXr9K9//UsTJ07U3r17XVjz2zNq1CitWbNG8fHxOnbsmN577z19/fXXhT4SvkGDBjp9+rSWLVumH3/8UW+99ZZWrVpV7Drcbltc762ZPHmy2wbHXkdAAQC4VWhoqHbs2KGcnBz16NFDkZGReu6552S321WpUvG/poYNG6aYmBj169dPbdu21YULF5x6EErLvffeq3nz5ik+Pl4tW7bU2rVr9fzzz+cbQHqjhx56SM8//7xGjhypVq1aaefOnZo4cWKx63C7bVGpUiUNHjxYOTk5evzxx4t9PFfgLh4A3MVTBhV2pwSsb+jQofr++++d7hayiqFDh+r8+fP66quvirW9q+7iYQwKAAAl7I033lC3bt3k6+urr7/+WosWLdLcuXPdXS0naWlp2rt3rz755BN9+eWX7q4OAQUAgJK2Z88ezZgxQ5cuXVK9evX01ltv6amnnnJ3tZw89NBD2rNnj55++mnzCb/uREABAKCE3erptVbgzluKC8IgWQAAYDkEFAAAYDlc4gFQovLeIcTdQQBuBz0oAADAcggoAADAcggoAADAcggoAIBya/DgwXr44YfdXQ3L2Lx5s2w2my5evHhH+6lbt65mz57tkjrdDINkAaA8ibOX8vHSSvd4qDDoQQEAuE1mZqa7qwCLIqAAAEpNp06dNHLkSI0ePVqBgYHmI9Xj4+PVvHlz+fr6KiwsTMOHD9fly5fN7RYuXKgaNWpo3bp1atq0qapVq6aePXvq3LlzZpmcnByNHj1aNWrUUM2aNTVu3DjlfR9uRkaGnn32WQUFBalKlSq67777tHfvXnP99Usg69at01
133SUfHx916dJFKSkp+vrrr9W0aVNVr15dAwYM0O+//37T8zx16pR69+4tf39/+fr6qlmzZlqzZo1ZzyFDhigiIkI+Pj5q3Lix3nzzTaftr1+amjJlioKDg1WjRg29+uqrys7O1gsvvKCAgADVrl1bH330kblNUlKSbDabli1bpqioKFWpUkXNmjW75RNid+7cqQ4dOsjHx0dhYWF69tlndeXKFXN9SkqKevfuLR8fH0VEROiTTz4pdH+uQkABAJSqRYsWycPDQzt27NB7770nSapUqZLeeustHTp0SIsWLdLGjRs1btw4p+1+//13vfHGG1qyZIm2bt2q06dPa+zYseb6mTNn6qOPPtL8+fO1fft2/fbbb1q1apXTPsaNG6fPP/9cixYt0rfffqsGDRqoR48e+u2335zKxcXFac6cOdq5c6fOnDmjvn37avbs2Vq6dKlWr16thIQEvf322zc9xxEjRigjI0Nbt27VwYMHNX36dFWrVk2SlJubq9q1a2vFihU6cuSIXnnlFb388sv5Hoe/ceNG/fzzz9q6davi4+MVFxenXr16yd/fX998842GDRumYcOG6cyZM07bvfDCCxozZoz279+vqKgo9enTRxcuXCiwngcPHlSPHj0UExOj7777TsuXL9f27ds1cuRIs8zgwYOVlJSkjRs36rPPPtPcuXOVkpJy03N3FZuRN16WAUV5XTOAW8v7MDXJdQ9U40FtJeOmr7S3+BiUTp06KS0tTfv37y+03H//93/rmWee0a+//irpWg/KE088oePHj6t+/fqSpLlz5+q1115TcnKyJCk0NFTPPfecXnzxRUlSdna2IiIi1Lp1a33xxRe6cuWK/P39tXDhQj322GOSpKysLNWtW1exsbF64YUXtHnzZnXu3FkbNmxQ165dJUnTpk3T+PHj9eOPP6pevXqSpGHDhikpKUlr164tsP4tWrTQo48+qkmTJt1Wu4wYMULnz5/XZ599JulaKNi8ebNOnDihSpWu9SU0adJEQUFB2rp1q6RrPTF2u10ffvih+vfvr6SkJEVERGjatGn52mDUqFEaN26ceX6pqamqUaOGHn/8cfn4+JhBUZK2b9+ujh076sqVKzp9+rQaN26s3bt3q23btpKk77//Xk2bNtWsWbMUGxub71xu+tlU0b6/GSQLVEAFBRKgtLRp0ybfsk2bNmnKlCk6cuSI0tPTlZ2drT/++ENXrlyRr6+vJKlq1apmOJGkkJAQ8y/5tLQ0nTt3Tu3btzfXe3h4qE2bNuZlnh9//FFZWVm69957zTKenp665557dPToUaf6tGjRwvw5ODhYVatWNcPJ9WV79uy56Tk+++yzeuaZZ7R+/Xo98MADevTRR532OW/ePH344Yc6deqUrl69qszMTLVq1cppH82aNTPDyfVjRkZGmvOVK1dWzZo18/VmFNQGec/vusTERB0/ftzpso1hGMrNzdXJkyf1r3/9y9zHdU2aNFGNGjVueu6uwiUeAGVC3ZdWO00ou64HjutOnTqlv/zlL4qMjNTnn3+uxMREvfPOO5Ku9XBc5+np6bSdzWbLN8akMNfL2my2fMvzLrvxWDabrcBj5+bm3vRYTz31lE6cOKGBAwfq4MGDatOmjXlJaMWKFXr++ef15JNPav369Tpw4ICeeOKJfAOGCzpmUetxY7mC5Obm6umnn9aBAwfM6X//93917Ngx1a9f/6ZtVhoIKAAAt9q3b5+ys7M1c+ZMtWvXTo0aNdLPP/9cpH3Y7XaFhIRo9+7d5rLs7GwlJiaa8w0aNJCXl5e2b99uLsvKytK+ffvUtGnTOz+RPMLCwjRs2DCtXLlSY8aM0QcffCBJ2rZtm6KiojR8+HDdddddatCggX788UeXHbegNmjSpEmBZe+++24dPnxYDRo0yDd5eXmpadOmys7O1r59+8xtfvjhhzt+jsrt4BIPAMCt6tevr+zsbL399tvq3bu3duzYoXnz5hV5P88995ymTZumhg0bqmnTpoqPj3f6IvX19dUzzzxj3gVTp04dzZgxQ7///ruGDBniwjOSYmNjFR0drUaNGik1NVUbN240Q1CDBg20ePFirVu3ThEREVqyZIn27t2riIgIlxz7nXfeMdtg1qxZSk1N1ZNPPllg2RdffFHt2rXTiBEjNHToUPn6+uro0aPmIODGjRurZ8+eGjp0qN5//315eHgoNjZWPj4+LqlrYehBAQC4VatWrRQfH6/p06crMjJSn3zyiaZOnVrk/YwZM0aPP/64Bg8erPbt28vPz0+PPPKIU5lp06bp0Ucf1cCBA3X33Xfr+PHjWrdunfz9/V11OpKuDWAdMWKEmjZtqp49e6px48aaO3eupGsDbGNiYtSvXz+1bdtWFy5c0PDhw1127GnTpmn69Olq2bKltm3bpi+//FKBgYEFlm3RooW2bNmiY8eO6f7779ddd92liRMnKiQkxCyzYMEChYWFqWPHjoqJidF//ud/KigoyGX1vRnu4gEqoNsZw2G1u3i4G8hZYXdKoGK6fhfP/v378w24LU2uuouHHhQAAGA5BBQAAGA5DJIFAKAcqFu3bpFuu7Y6elAAAIDlEFAAAIDlEFAAoAwrT136KB9c9ZkkoABAGVS5cmVJyvd4dMDdrn8mr39Gi6tIg2SnTp2qlStX6vvvv5ePj4+ioqI0ffp0NW7c2CwzePBgLVq0yGm7tm3bOj16NyMjQ2PHjtWnn36qq1evqmvXrpo7d65q1659RycDoHQV59kkJfnm5IrEw8NDVatW1S+//CJPT0+nl8oB7pKbm6tffvlFVatWlYfHnd2HU6Stt2zZohEjRujPf/6zsrOzNWHCBHXv3l1HjhxxevlTz549tWDBAnPey8vLaT+xsbH6n//5Hy1btkw1a9bUmDFj1KtXLyUmJt5x4gKAisBmsykkJEQnT57UqVOn3F0dwFSpUiXVqVPnjl8wWKSAsnbtWqf5BQsWKCgoSImJierQoYO53NvbWw6Ho8B9pKWlaf78+VqyZIkeeOABSdLHH3+ssLAwbdiwQT169CjqOQBAheTl5aWGDRtymQeW4uXl5ZIevTvqf0lLS5MkBQQEOC3fvHmzgoKCVKNGDXXs2FGvv/66+dz+xMREZWVlqXv37mb50NBQRUZGaufOnQUGlIyMDGVkZJjz6enpd1JtACg3KlWqxKPuUS4VO+IYhqHRo0frvvvuU2RkpLk8Ojpan3zyiTZu3KiZM2dq79696tKlixkwkpOT5eXlle/FTMHBwUpOTi7wWFOnTpXdbjensLCw4lYbAACUAcXuQRk5cqS+++47bd++3Wl5v379zJ8jIyPVpk0bhYeHa/Xq1YqJibnp/gzDuOn1qvHjx2v06NHmfHp6OiEFAIByrFg9KKNGjdJXX32lTZs23fLOm5CQEIWHh+vYsWOSJIfDoczMTKWmpjqVS0lJUXBwcIH78Pb2VvXq1Z0mAABQfhWpB8UwDI0aNUqrVq3S5s2bFRERccttLly4oDNnzigkJESS1Lp1a3l6eiohIUF9+/aVJJ07d06HDh3SjBkzinEKA
KyioFuIAaA4ihRQRowYoaVLl+rLL7+Un5+fOWbEbrfLx8dHly9fVlxcnB599FGFhIQoKSlJL7/8sgIDA/XII4+YZYcMGaIxY8aoZs2aCggI0NixY9W8eXPzrh4AAFCxFSmgvPvuu5KkTp06OS1fsGCBBg8erMqVK+vgwYNavHixLl68qJCQEHXu3FnLly+Xn5+fWX7WrFny8PBQ3759zQe1LVy4kGegAAAAScW4xFMYHx8frVu37pb7qVKlit5++229/fbbRTk8AACoIHg2MgAAsBwCCgAAsBwCCgAAsBwCCgAAsJw7excyAFhI3uewJE170E01AXCn6EEBAACWQw8KgALRGwHAnQgoANyOMAQgLy7xAAAAy6EHBcBt4UWAAEoTPSgAAMBy6EEBYDn01gCgBwUAAFgOAQUAAFgOl3gAlEm3cxmooDLcwgyUDfSgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAy/FwdwUAoDTVfWm103zStAfdVBMAhaEHBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA5vMwYsKu9bdyXevAug4qAHBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA6DZIFyrqDBtgBgdfSgAAAAyyGgAAAAyyGgAAAAyylSQJk6dar+/Oc/y8/PT0FBQXr44Yf1ww8/OJUxDENxcXEKDQ2Vj4+POnXqpMOHDzuVycjI0KhRoxQYGChfX1/16dNHZ8+evfOzAQAA5UKRAsqWLVs0YsQI7d69WwkJCcrOzlb37t115coVs8yMGTMUHx+vOXPmaO/evXI4HOrWrZsuXbpklomNjdWqVau0bNkybd++XZcvX1avXr2Uk5PjujMDAABlVpHu4lm7dq3T/IIFCxQUFKTExER16NBBhmFo9uzZmjBhgmJiYiRJixYtUnBwsJYuXaqnn35aaWlpmj9/vpYsWaIHHnhAkvTxxx8rLCxMGzZsUI8ePVx0agAAoKy6ozEoaWlpkqSAgABJ0smTJ5WcnKzu3bubZby9vdWxY0ft3LlTkpSYmKisrCynMqGhoYqMjDTL5JWRkaH09HSnCQAAlF/FDiiGYWj06NG67777FBkZKUlKTk6WJAUHBzuVDQ4ONtclJyfLy8tL/v7+Ny2T19SpU2W3280pLCysuNUGAABlQLEDysiRI/Xdd9/p008/zbfOZrM5zRuGkW9ZXoWVGT9+vNLS0szpzJkzxa02AAAoA4oVUEaNGqWvvvpKmzZtUu3atc3lDodDkvL1hKSkpJi9Kg6HQ5mZmUpNTb1pmby8vb1VvXp1pwkAAJRfRQoohmFo5MiRWrlypTZu3KiIiAin9REREXI4HEpISDCXZWZmasuWLYqKipIktW7dWp6enk5lzp07p0OHDpllAABAxVaku3hGjBihpUuX6ssvv5Sfn5/ZU2K32+Xj4yObzabY2FhNmTJFDRs2VMOGDTVlyhRVrVpVjz32mFl2yJAhGjNmjGrWrKmAgACNHTtWzZs3N+/qAVB8vHsHQHlQpIDy7rvvSpI6derktHzBggUaPHiwJGncuHG6evWqhg8frtTUVLVt21br16+Xn5+fWX7WrFny8PBQ3759dfXqVXXt2lULFy5U5cqV7+xsAABAuWAzDMNwdyWKKj09XXa7XWlpaYxHQblVUE9I0rQHb1kGRZO3TQGUnKJ8f/MuHgAAYDkEFAAAYDkEFAAAYDkEFAAAYDlFuosHgHsxKNb1bmcwMoDSRw8KAACwHAIKAACwHAIKAACwHAIKAACwHAbJAsAtMJAWKH30oAAAAMshoAAAAMshoAAAAMthDAoA5MED8QD3owcFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDk+SBUpB3ieT8iZcACgcPSgAAMBy6EEB3KCgd73QqwIA/0YPCgAAsBx6UACL4A26APBv9KAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLKXJA2bp1q3r37q3Q0FDZbDZ98cUXTusHDx4sm83mNLVr186pTEZGhkaNGqXAwED5+vqqT58+Onv27B2dCAAAKD+KHFCuXLmili1bas6cOTct07NnT507d86c1qxZ47Q+NjZWq1at0rJly7R9+3ZdvnxZvXr1Uk5OTtHPAAAAlDseRd0gOjpa0dHRhZbx9vaWw+EocF1aWprmz5+vJUuW6IEHHpAkffzxxwoLC9OGDRvUo0ePolYJAACUMyUyBmXz5s0KCgpSo0aNNHToUKWkpJjrEhMTlZWVpe7du5vLQkNDFRkZqZ07dxa4v4yMDKWnpztNAACg/HJ5QImOjtYnn3yijRs3aubMmdq7d6+6dOmijIwMSVJycrK8vLzk7+/vtF1wcLCSk5ML3OfUqVNlt9vNKSwszNXVBgAAFlLkSzy30q9fP/PnyMhItWnTRuHh4Vq9erViYmJuup1hGLLZbAWuGz9+vEaPHm3Op6enE1IAACjHSvw245CQEIWHh+vYsWOSJIfDoczMTKWmpjqVS0lJUXBwcIH78Pb2VvXq1Z0mAABQfpV4QLlw4YLOnDmjkJAQSVLr1q3l6emphIQEs8y5c+d06NAhRUVFlXR1AABAGVDkSzyXL1/W8ePHzfmTJ0/qwIEDCggIUEBAgOLi4vToo48qJCRESUlJevnllxUYGKhHHnlEkmS32zVkyBCNGTNGNWvWVEBAgMaOHavmzZubd/UAAICKrcgBZd++fercubM5f31syKBBg/Tuu+/q4MGDWrx4sS5evKiQkBB17txZy5cvl5+fn7nNrFmz5OHhob59++rq1avq2rWrFi5cqMqVK7vglAAAQFlnMwzDcHcliio9PV12u11paWmMR0GZUPel1e6uAlwsadqD7q4CUOYU5fubd/EAAADLIaAAAADLcflzUACgIsh72Y5LPoBr0YMCAAAsh4ACAAAsh4ACAAAsh4ACAAAsh0GyAOACBT3rhoGzQPHRgwIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACzHw90VAMq6ui+tdppPmvagm2oCAOUHPSgAAMByCCgAAMByCCgAAMByCCgAAMByGCQLACWEAdRA8dGDAgAALIeAAgAALIeAAgAALIeAAgAALIdBskAhGOQIAO5BDwoAALAcAgoAALAcAgoA
ALAcAgoAALAcAgoAALAc7uIBADfiTjGgYPSgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyylyQNm6dat69+6t0NBQ2Ww2ffHFF07rDcNQXFycQkND5ePjo06dOunw4cNOZTIyMjRq1CgFBgbK19dXffr00dmzZ+/oRACrqPvS6nwTAKBoihxQrly5opYtW2rOnDkFrp8xY4bi4+M1Z84c7d27Vw6HQ926ddOlS5fMMrGxsVq1apWWLVum7du36/Lly+rVq5dycnKKfyYAAKDcKPLLAqOjoxUdHV3gOsMwNHv2bE2YMEExMTGSpEWLFik4OFhLly7V008/rbS0NM2fP19LlizRAw88IEn6+OOPFRYWpg0bNqhHjx53cDoAAKA8cOkYlJMnTyo5OVndu3c3l3l7e6tjx47auXOnJCkxMVFZWVlOZUJDQxUZGWmWySsjI0Pp6elOEwAAKL9cGlCSk5MlScHBwU7Lg4ODzXXJycny8vKSv7//TcvkNXXqVNntdnMKCwtzZbUBAIDFlMhdPDabzWneMIx8y/IqrMz48eOVlpZmTmfOnHFZXQEAgPW4NKA4HA5JytcTkpKSYvaqOBwOZWZmKjU19aZl8vL29lb16tWdJgAAUH65NKBERETI4XAoISHBXJaZmaktW7YoKipKktS6dWt5eno6lTl37pwOHTpklgEAABVbke/iuXz5so4fP27Onzx5UgcOHFBAQIDq1Kmj2NhYTZkyRQ0bNlTDhg01ZcoUVa1aVY899pgkyW63a8iQIRozZoxq1qypgIAAjR07Vs2bNzfv6gGsimeaAEDpKHJA2bdvnzp37mzOjx49WpI0aNAgLVy4UOPGjdPVq1c1fPhwpaamqm3btlq/fr38/PzMbWbNmiUPDw/17dtXV69eVdeuXbVw4UJVrlzZBacEAADKOpthGIa7K1FU6enpstvtSktLYzwKShQ9JnClpGkP5luW9zNWUBmgvCjK9zfv4gEAAJZDQAEAAJZDQAEAAJZDQAEAAJZDQAEAAJZDQAEAAJZT5OegAACKh9vWgdtHDwoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcHnUPABZS0OPwk6Y96IaaAO5FDwoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcHnUP/L+CHjEOAHAPAgoAWFze8My7eVARcIkHAABYDj0oqBC4fAMAZQs9KAAAwHIIKAAAwHK4xAOgQElVHnOar/vHUjfVBEBFRA8KAACwHHpQAKCc4vZklGX0oAAAAMuhBwVAsdw4RoXxKQBcjR4UAABgOQQUAABgOQQUAABgOYxBAYBygNc5oLwhoAAw5X04GwC4CwEF5RJ/TQJA2cYYFAAAYDn0oABAGUMPISoCelAAAIDl0IMCoFTxlmQAt4MeFAAAYDkEFAAAYDkuDyhxcXGy2WxOk8PhMNcbhqG4uDiFhobKx8dHnTp10uHDh11dDaDCSqrymNMEAGVRiYxBadasmTZs2GDOV65c2fx5xowZio+P18KFC9WoUSNNnjxZ3bp10w8//CA/P7+SqA6AcorxLED5VSIBxcPDw6nX5DrDMDR79mxNmDBBMTExkqRFixYpODhYS5cu1dNPP10S1QHw/0rrC91VxyGAABVXiYxBOXbsmEJDQxUREaH+/fvrxIkTkqSTJ08qOTlZ3bt3N8t6e3urY8eO2rlzZ0lUBQAAlEEu70Fp27atFi9erEaNGun8+fOaPHmyoqKidPjwYSUnJ0uSgoODnbYJDg7WqVOnbrrPjIwMZWRkmPPp6emurjYAALAQlweU6Oho8+fmzZurffv2ql+/vhYtWqR27dpJkmw2m9M2hmHkW3ajqVOn6tVXX3V1VQEUAQNuAZSmEn9Qm6+vr5o3b65jx47p4YcfliQlJycrJCTELJOSkpKvV+VG48eP1+jRo8359PR0hYWFlVidgbLkVsGhPAWL8nQuAApX4s9BycjI0NGjRxUSEqKIiAg5HA4lJCSY6zMzM7VlyxZFRUXddB/e3t6qXr260wQAeXF7NVB+uLwHZezYserdu7fq1KmjlJQUTZ48Wenp6Ro0aJBsNptiY2M1ZcoUNWzYUA0bNtSUKVNUtWpVPfYYv1CAiogwAaAgLg8oZ8+e1YABA/Trr7+qVq1aateunXbv3q3w8HBJ0rhx43T16lUNHz5cqampatu2rdavX88zUAAAgMnlAWXZsmWFrrfZbIqLi1NcXJyrDw0AAMoJ3mYMVGBcXgFgVQQUABUeT6wFrIeAgjKv7kur3V2FCo+emLIr7/+fpGkPuqkmgLMSv80YAACgqOhBAdyEywoAcHMEFADIg/AIuB8BBbCIG78U+UK8c7caF0MbF6ygMV2MS4E7MAYFAABYDgEFAABYDpd4AOAWuPwGlD4CCiyN6+EAUDERUAAL4i4SABUdAQVlTll9cixPW7UW/j0Aa2OQLAAAsBx6UGApZbV3BADgWgQUoAwqa5cnylp9yyv+AEBZQkABygC+4AFUNAQUAECh8va8cKs/SgODZAEAgOUQUAAAgOVwiQcA7gAP1QNKBgEFAErIrQY3E2aAmyOgAEAR3Cp0cMcV4BqMQQEAAJZDQAEAAJbDJR6gBNHdD1e58bPk7rErBT2RlmejwNUIKADgJlYKHYDVEFAAF6LHBABcgzEoAADAcuhBAQALKOsPfON9PXA1AgqQR1G+KLikAwAlg4ACtynoTgAAACQCCkoItyECd6Y89s5xGQhFwSBZAABgOQQUAABgOVziAW6hPHa1A4DVEVBQahgUC7hGWbglmf/vuFNc4gEAAJZDQAEAAJbDJR6USzxsDQDKNgIKKgRCCMqzsjAmpSC387wknqlUcRFQ4BIMiAMAuBIBBUVWmmHkxr8My8pfhQCKjz92cB2DZAEAgOXQgwIA5VhZHZ8C0IMCAAAshx6UCo63iwIArIgeFAAAYDn0oOCW3HXXDgBrKez/Z2mObaHnt2IgoFhMRXsoEYEEcL2i/L+ySugA8iKgoNS5KpQQboCi4//Nv9ETY20ElIoszq6kKv+eLcpfS4XdushtjUD5QJiBO7l1kOzcuXMVERGhKlWqqHXr1tq2bZs7qwMAACzCbT0oy5cvV2xsrObOnat7771X7733nqKjo3XkyBHVqVPHXdUq++Lsd7R5SQyI5ZIOUP65sueUV1xAcmNAiY+P15AhQ/TUU09JkmbPnq1169bp3Xff1dSpU91VrTLrerC48ZJNUXFpBkBpcPXvmuL8YZVU5bFiX+JG6XBLQMnMzFRiYqJeeuklp+Xdu3fXzp0785XPyMhQRkaGOZ+WliZJSk9PL5H6RU5a5zR/6NUepbaf3Izf8y3Le5559+tU1mbcZu1u7TvbgH8f84/5hR7nxrLpGQJQwd34O0Fy/h1S2O8Pyfl3SEG/E290qMqQ265TYXX
Izfi9wO+UAn+PT63tXGj82UK3Kcjt7Kc0FVTn4n73FeZ6GxvGbXxXGW7w008/GZKMHTt2OC1//fXXjUaNGuUrP2nSJEMSExMTExMTUzmYzpw5c8us4Na7eGw2m9O8YRj5lknS+PHjNXr0aHM+NzdXv/32m2rWrFlg+bIqPT1dYWFhOnPmjKpXr+7u6pRbtHPpoa1LB+1cOmjnO2cYhi5duqTQ0NBblnVLQAkMDFTlypWVnJzstDwlJUXBwcH5ynt7e8vb29tpWY0aNUqyim5VvXp1PvylgHYuPbR16aCdSwftfGfsdvttlXPLbcZeXl5q3bq1EhISnJYnJCQoKirKHVUCAAAW4rZLPKNHj9bAgQPVpk0btW/fXu+//75Onz6tYcOGuatKAADAItwWUPr166cLFy7otdde07lz5xQZGak1a9YoPDzcXVVyO29vb02aNCnf5Sy4Fu1cemjr0kE7lw7auXTZDON27vUBAAAoPW591D0AAEBBCCgAAMByCCgAAMByCCgAAMByCChuULduXdlsNqcp73uJTp8+rd69e8vX11eBgYF69tlnlZmZ6VTm4MGD6tixo3x8fPSnP/1Jr7322u2936CCmzt3riIiIlSlShW1bt1a27Ztc3eVyoy4uLh8n12Hw2GuNwxDcXFxCg0NlY+Pjzp16qTDhw877SMjI0OjRo1SYGCgfH191adPH5096753kFjF1q1b1bt3b4WGhspms+mLL75wWu+qtk1NTdXAgQNlt9tlt9s1cOBAXbx4sYTPzjpu1c6DBw/O9xlv166dUxnauXQQUNzk+u3V16f/+q//Mtfl5OTowQcf1JUrV7R9+3YtW7ZMn3/+ucaMGWOWSU9PV7du3RQaGqq9e/fq7bff1htvvKH4+Hh3nE6ZsXz5csXGxmrChAnav3+/7r//fkVHR+v06dPurlqZ0axZM6fP7sGDB811M2bMUHx8vObMmaO9e/fK4XCoW7duunTpklkmNjZWq1at0rJly7R9+3ZdvnxZvXr1Uk5OjjtOxzKuXLmili1bas6cOQWud1XbPvbYYzpw4IDWrl2rtWvX6sCBAxo4cGCJn59V3KqdJalnz55On/E1a9Y4raedS8mdv/oPRRUeHm7MmjXrpuvXrFljVKpUyfjpp5/MZZ9++qnh7e1tpKWlGYZhGHPnzjXsdrvxxx9/mGWmTp1qhIaGGrm5uSVW97LunnvuMYYNG+a0rEmTJsZLL73kphqVLZMmTTJatmxZ4Lrc3FzD4XAY06ZNM5f98ccfht1uN+bNm2cYhmFcvHjR8PT0NJYtW2aW+emnn4xKlSoZa9euLdG6lyWSjFWrVpnzrmrbI0eOGJKM3bt3m2V27dplSDK+//77Ej4r68nbzoZhGIMGDTIeeuihm25DO5ceelDcZPr06apZs6ZatWql119/3enyza5duxQZGen0MqUePXooIyNDiYmJZpmOHTs6PTCoR48e+vnnn5WUlFRq51GWZGZmKjExUd27d3da3r17d+3cudNNtSp7jh07ptDQUEVERKh///46ceKEJOnkyZNKTk52al9vb2917NjRbN/ExERlZWU5lQkNDVVkZCT/BoVwVdvu2rVLdrtdbdu2Ncu0a9dOdrud9r/B5s2bFRQUpEaNGmno0KFKSUkx19HOpcetbzOuqJ577jndfffd8vf31549ezR+/HidPHlSH374oSQpOTk530sT/f395eXlZb5gMTk5WXXr1nUqc32b5ORkRURElPyJlDG//vqrcnJy8rVtcHBwvhdXomBt27bV4sWL1ahRI50/f16TJ09WVFSUDh8+bLZhQe176tQpSdc+m15eXvL3989Xhn+Dm3NV2yYnJysoKCjf/oOCgmj//xcdHa2//vWvCg8P18mTJzVx4kR16dJFiYmJ8vb2pp1LEQHFReLi4vTqq68WWmbv3r1q06aNnn/+eXNZixYt5O/vr//4j/8we1UkyWaz5dveMAyn5XnLGP8/QLagbfFvBbUbbXZ7oqOjzZ+bN2+u9u3bq379+lq0aJE5kLA47cu/we1xRdvezu+Wiqxfv37mz5GRkWrTpo3Cw8O1evVqxcTE3HQ72tn1uMTjIiNHjtTRo0cLnSIjIwvc9vov9uPHj0uSHA5HvpSdmpqqrKws8y+ogspc74bM+1cWrgkMDFTlypULbDfarHh8fX3VvHlzHTt2zLybp7D2dTgcyszMVGpq6k3LID9Xta3D4dD58+fz7f+XX36h/W8iJCRE4eHhOnbsmCTauTQRUFwkMDBQTZo0KXSqUqVKgdvu379f0rX/CJLUvn17HTp0SOfOnTPLrF+/Xt7e3mrdurVZZuvWrU5jV9avX6/Q0NB8l35wjZeXl1q3bq2EhASn5QkJCYqKinJTrcq2jIwMHT16VCEhIYqIiJDD4XBq38zMTG3ZssVs39atW8vT09OpzLlz53To0CH+DQrhqrZt37690tLStGfPHrPMN998o7S0NNr/Ji5cuKAzZ86Yv59p51LkrtG5FdXOnTuN+Ph4Y//+/caJEyeM5cuXG6GhoUafPn3MMtnZ2UZkZKTRtWtX49tvvzU2bNhg1K5d2xg5cqRZ5uLFi0ZwcLAxYMAA4+DBg8bKlSuN6tWrG2+88YY7TqvMWLZsmeHp6WnMnz/fOHLkiBEbG2v4+voaSUlJ7q5amTBmzBhj8+bNxokTJ4zdu3cbvXr1Mvz8/Mz2mzZtmmG3242VK1caBw8eNAYMGGCEhIQY6enp5j6GDRtm1K5d29iwYYPx7bffGl26dDFatmxpZGdnu+u0LOHSpUvG/v37jf379xuSzN8Tp06dMgzDdW3bs2dPo0WLFsauXbuMXbt2Gc2bNzd69epV6ufrLoW186VLl4wxY8YYO3fuNE6ePGls2rTJaN++vfGnP/2JdnYDAkopS0xMNNq2bWvY7XajSpUqRuPGjY1JkyYZV65ccSp36tQp48EHHzR8fHyMgIAAY+TIkU63FBuGYXz33XfG/fffb3h7exsOh8OIi4vjFuPb8M477xjh4eGGl5eXcffddxtbtmxxd5XKjH79+hkhISGGp6enERoaasTExBiHDx821+fm5hqTJk0yHA6H4e3tbXTo0ME4ePCg0z6uXr1qjBw50ggICDB8fHyMXr16GadPny7tU7GcTZs2GZLyTYMGDTIMw3Vte+HCBeNvf/ub4efnZ/j5+Rl/+9vfjNTU1FI6S/crrJ1///13o3v37katWrUMT09Po06dOsagQYPytSHtXDpshsGjRwEAgLUwBgUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFjO/wHim9be76qbVAAAAABJRU5ErkJggg==", 105 | "text/plain": [ 106 | "
" 107 | ] 108 | }, 109 | "metadata": {}, 110 | "output_type": "display_data" 111 | } 112 | ], 113 | "source": [ 114 | "# Randomly choose 2000 indices\n", 115 | "indices_to_remove = np.random.choice(gaussian_array.size, 2000, replace=False)\n", 116 | "\n", 117 | "# Get the samples corresponding to the chosen indices\n", 118 | "random_samples = gaussian_array[indices_to_remove]\n", 119 | "\n", 120 | "# Remove the selected samples from the original array\n", 121 | "remaining_arr = np.delete(gaussian_array, indices_to_remove)\n", 122 | "\n", 123 | "plt.hist(remaining_arr, bins=100, label='remaining array');\n", 124 | "plt.hist(random_samples, bins=100, label='random sampled');\n", 125 | "plt.legend()\n", 126 | "\n", 127 | "print(\"Remaining array shape:\", remaining_arr.shape)\n", 128 | "print(\"Random samples shape:\", random_samples.shape)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "8dd8f9f6-4673-4baf-9f80-ae14af502c48", 134 | "metadata": {}, 135 | "source": [ 136 | "#### Let's scale it!" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "id": "37c21018-07cd-4e74-9656-cd5f7919eec9", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "scaler = QuantileTransformer(n_quantiles = 74)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "id": "085f3dd5-704d-4a64-88c2-e495faf8e5a4", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "arr_transformed = scaler.fit_transform(remaining_arr.reshape(-1, 1))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "id": "e77ca417-db7b-4b86-856e-8ca0a5b28021", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "filename = \"quantile_scaler.json\"" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "id": "2a769941-850d-4379-a00a-4d6c7a60d4d1", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# save to json file\n", 177 | "save_scaler(scaler, filename)\n", 178 | "# create new StandardScaler from json file information.\n", 179 | "new_scaler = load_scaler(filename)\n", 180 | "# new_scaler is a StandardScaler object" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "f4db109a-22ee-473f-a055-bc30645da34c", 186 | "metadata": {}, 187 | "source": [ 188 | "Let's load those parameters, and fit on the subsampled dataset from above:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "id": "8cbd215e-57eb-403a-8759-b0b8d8a04462", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "random_transformed = new_scaler.fit_transform(random_samples.reshape(-1,1))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "857b3868-7082-49e1-9cbe-4da46fd59d32", 204 | "metadata": {}, 205 | "source": [ 206 | "We can also inverse transform to get close to the original array. The number of quantiles will have a large impact on this." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 10, 212 | "id": "0bd88fdf-3ad5-4f69-b564-26676be298ec", 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "array([[386.35688039],\n", 219 | " [586.64628448],\n", 220 | " [155.9777724 ],\n", 221 | " ...,\n", 222 | " [296.04124775],\n", 223 | " [ 50.26756938],\n", 224 | " [457.81957258]])" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "inverted_random = new_scaler.inverse_transform(random_transformed)\n", 234 | "inverted_random" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "id": "3da9bf26-2e80-4345-8f9f-74dd136d42b5", 240 | "metadata": {}, 241 | "source": [ 242 | "Let's see how far apart the medians are:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 11, 248 | "id": "0a398609-b80d-4cd3-acf5-dc3bd0bf5b1e", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "0.3066" 255 | ] 256 | }, 257 | "execution_count": 11, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "median_random = np.median(random_transformed)\n", 264 | "median_arr = np.median(arr_transformed)\n", 265 | "percentage_difference = np.round(np.abs(median_random - median_arr) / median_random * 100, 4)\n", 266 | "percentage_difference" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "b6799f65-f898-4357-aef2-e9283e9ecc73", 272 | "metadata": {}, 273 | "source": [ 274 | "## Pandas Example" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "0d328fee-9ad4-4245-8e33-6ef53c7213e4", 280 | "metadata": {}, 281 | "source": [ 282 | "This is using a [Group Scaler](https://bridgescaler.readthedocs.io/en/latest/group.html). " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 12, 288 | "id": "89a78409-d918-427e-8321-cc2e3b277dc9", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/html": [ 294 | "
\n", 295 | "\n", 308 | "\n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
abcde
count1000.0000001000.0000001000.0000001000.0000001000.000000
mean0.4931600.5082490.4823080.5003100.490955
std0.2874040.2909550.2833990.2860280.289448
min0.0002820.0004660.0002200.0017260.002557
25%0.2392520.2540530.2349610.2602310.238156
50%0.4897620.5283420.4793330.4984970.489239
75%0.7470750.7647940.7142800.7579060.743647
max0.9995660.9984530.9992720.9979660.999053
\n", 386 | "
" 387 | ], 388 | "text/plain": [ 389 | " a b c d e\n", 390 | "count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000\n", 391 | "mean 0.493160 0.508249 0.482308 0.500310 0.490955\n", 392 | "std 0.287404 0.290955 0.283399 0.286028 0.289448\n", 393 | "min 0.000282 0.000466 0.000220 0.001726 0.002557\n", 394 | "25% 0.239252 0.254053 0.234961 0.260231 0.238156\n", 395 | "50% 0.489762 0.528342 0.479333 0.498497 0.489239\n", 396 | "75% 0.747075 0.764794 0.714280 0.757906 0.743647\n", 397 | "max 0.999566 0.998453 0.999272 0.997966 0.999053" 398 | ] 399 | }, 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "x_rand = np.random.random(size=(1000, 5))\n", 407 | "data = pd.DataFrame(data=x_rand,\n", 408 | " columns=[\"a\", \"b\", \"c\", \"d\", \"e\"])\n", 409 | "data.describe()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 13, 415 | "id": "f2366229-634e-463f-b989-efd216efb58a", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "groups = [[\"a\", \"b\"], [\"c\", \"d\"], \"e\"]\n", 420 | "group_scaler = GroupStandardScaler()\n", 421 | "x_transformed = group_scaler.fit_transform(data, groups=groups)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "id": "5c565379-6bfb-4b41-b0fe-0f6cf3907aec", 427 | "metadata": {}, 428 | "source": [ 429 | "We can save out this file like the above:" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 14, 435 | "id": "6d04a3af-7fe1-42de-b9b6-793b82870bea", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "# save to json file\n", 440 | "save_scaler(group_scaler, 'group_scaler.json')" 441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [conda env:unidata-cpu]", 447 | "language": "python", 448 | "name": "conda-env-unidata-cpu-py" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.10.0" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 5 465 | } 466 | -------------------------------------------------------------------------------- /bridgescaler/distributed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.lib.recfunctions import structured_to_unstructured, unstructured_to_structured 3 | from copy import deepcopy 4 | from crick import TDigest as CTDigest 5 | from scipy.special import ndtr, ndtri 6 | import pandas as pd 7 | import xarray as xr 8 | from functools import partial 9 | from scipy.stats import logistic 10 | from numba import guvectorize, float32, float64, void 11 | CENTROID_DTYPE = np.dtype([('mean', np.float64), ('weight', np.float64)]) 12 | 13 | 14 | class DBaseScaler(object): 15 | """ 16 | Base distributed scaler class. Used only to store attributes and methods shared across all distributed 17 | scaler subclasses. 
18 | """ 19 | def __init__(self, channels_last=True): 20 | self.x_columns_ = None 21 | self.is_array_ = False 22 | self._fit = False 23 | self.channels_last = channels_last 24 | 25 | def is_fit(self): 26 | return self._fit 27 | 28 | @staticmethod 29 | def extract_x_columns(x, channels_last=True): 30 | """ 31 | Extract the variable names to be transformed from x depending on if x is a pandas DataFrame, an 32 | xarray DataArray, or a numpy array. All of these assume that the columns are in the last dimension. 33 | If x is an xarray DataArray, there should be a coorindate variable with the same name as the last dimension 34 | of the DataArray being transformed. 35 | 36 | Args: 37 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): array of values to be transformed. 38 | channels_last (bool): If True, then assume the variable or channel dimension is the last dimension of the 39 | array. If False, then assume the variable or channel dimension is second. 40 | 41 | Returns: 42 | xv (numpy.ndarray): Array of values to be transformed. 43 | is_array (bool): Whether or not x was a np.ndarray. 44 | """ 45 | is_array = False 46 | var_dim_num = -1 47 | if not channels_last: 48 | var_dim_num = 1 49 | if hasattr(x, "columns"): 50 | x_columns = x.columns.values 51 | elif hasattr(x, "coords"): 52 | var_dim = x.dims[var_dim_num] 53 | x_columns = x.coords[var_dim].values 54 | else: 55 | x_columns = np.arange(x.shape[var_dim_num]) 56 | is_array = True 57 | return x_columns, is_array 58 | 59 | @staticmethod 60 | def extract_array(x): 61 | if hasattr(x, "columns") or hasattr(x, "coords"): 62 | xv = x.values 63 | else: 64 | xv = x 65 | return xv 66 | 67 | def get_column_order(self, x_in_columns): 68 | """ 69 | Get the indices of the scaler columns that have the same name as the columns in the input x array. This 70 | enables users to pass a DataFrame or DataArray to transform or inverse_transform with fewer columns than 71 | the original scaler or columns in a different order and still have the input dataset be transformed properly. 72 | 73 | Args: 74 | x_in_columns (Union[list, numpy.ndarray]): list of input columns. 75 | 76 | Returns: 77 | x_in_col_indices (np.ndarray): indices of the input columns from x in the scaler in order. 78 | """ 79 | assert np.all(np.isin(x_in_columns, self.x_columns_)), "Some input columns not in scaler x_columns." 80 | x_in_col_indices = np.array([np.where(col == np.array(self.x_columns_))[0][0] for col in x_in_columns]) 81 | return x_in_col_indices 82 | 83 | @staticmethod 84 | def package_transformed_x(x_transformed, x): 85 | """ 86 | Repackaged a transformed numpy array into the same datatype as the original x, including 87 | all metadata. 
88 | 89 | Args: 90 | x_transformed (numpy.ndarray): array after being transformed or inverse transformed 91 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): 92 | 93 | Returns: 94 | 95 | """ 96 | if hasattr(x, "columns"): 97 | x_packaged = pd.DataFrame(x_transformed, index=x.index, columns=x.columns) 98 | elif hasattr(x, "coords"): 99 | x_packaged = xr.DataArray(x_transformed, coords=x.coords, dims=x.dims, attrs=x.attrs, name=x.name) 100 | else: 101 | x_packaged = x_transformed 102 | return x_packaged 103 | 104 | def set_channel_dim(self, channels_last=None): 105 | if channels_last is None: 106 | channels_last = self.channels_last 107 | if channels_last: 108 | channel_dim = -1 109 | else: 110 | channel_dim = 1 111 | return channel_dim 112 | 113 | def process_x_for_transform(self, x, channels_last=None): 114 | if channels_last is None: 115 | channels_last = self.channels_last 116 | channel_dim = self.set_channel_dim(channels_last) 117 | assert self._fit, "Scaler has not been fit." 118 | x_in_cols, is_array = self.extract_x_columns(x, channels_last=channels_last) 119 | if is_array: 120 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "Number of input columns does not match scaler." 121 | x_col_order = np.arange(x.shape[channel_dim]) 122 | else: 123 | x_col_order = self.get_column_order(x_in_cols) 124 | xv = self.extract_array(x) 125 | x_transformed = np.zeros(xv.shape, dtype=xv.dtype) 126 | return xv, x_transformed, channels_last, channel_dim, x_col_order 127 | 128 | def fit(self, x, weight=None): 129 | pass 130 | 131 | def transform(self, x, channels_last=None): 132 | pass 133 | 134 | def fit_transform(self, x, channels_last=None, weight=None): 135 | self.fit(x, weight=weight) 136 | return self.transform(x, channels_last=channels_last) 137 | 138 | def inverse_transform(self, x, channels_last=None): 139 | pass 140 | 141 | def __add__(self, other): 142 | pass 143 | 144 | def subset_columns(self, sel_columns): 145 | pass 146 | 147 | def add_variables(self, other): 148 | pass 149 | 150 | 151 | class DStandardScaler(DBaseScaler): 152 | """ 153 | Distributed version of StandardScaler. You can calculate this map-reduce style by running it on individual 154 | data files, return the fitted objects, and then sum them together to represent the full dataset. Scaler 155 | supports numpy arrays, pandas dataframes, and xarray DataArrays and will return a transformed array in the 156 | same form as the original with column or coordinate names preserved. 
157 | 158 | """ 159 | def __init__(self, channels_last=True): 160 | self.mean_x_ = None 161 | self.n_ = 0 162 | self.var_x_ = None 163 | super().__init__(channels_last=channels_last) 164 | 165 | def fit(self, x, weight=None): 166 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 167 | xv = self.extract_array(x) 168 | channel_dim = self.set_channel_dim() 169 | if not self._fit: 170 | self.x_columns_ = x_columns 171 | self.is_array_ = is_array 172 | if len(xv.shape) > 2: 173 | if self.channels_last: 174 | self.n_ += np.prod(xv.shape[:-1]) 175 | else: 176 | self.n_ += xv.shape[0] * np.prod(xv.shape[2:]) 177 | else: 178 | self.n_ += xv.shape[0] 179 | self.mean_x_ = np.zeros(xv.shape[channel_dim], dtype=xv.dtype) 180 | self.var_x_ = np.zeros(xv.shape[channel_dim], dtype=xv.dtype) 181 | if self.channels_last: 182 | for i in range(xv.shape[channel_dim]): 183 | self.mean_x_[i] = np.nanmean(xv[..., i]) 184 | self.var_x_[i] = np.nanvar(xv[..., i]) 185 | else: 186 | for i in range(xv.shape[channel_dim]): 187 | self.mean_x_[i] = np.nanmean(xv[:, i]) 188 | self.var_x_[i] = np.nanvar(xv[:, i]) 189 | 190 | else: 191 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 192 | if is_array: 193 | if self.channels_last: 194 | x_col_order = np.arange(x.shape[-1]) 195 | else: 196 | x_col_order = np.arange(x.shape[1]) 197 | else: 198 | x_col_order = self.get_column_order(x_columns) 199 | # update derived from 200 | # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups 201 | if len(xv.shape) > 2: 202 | if self.channels_last: 203 | new_n = np.prod(xv.shape[:-1]) 204 | else: 205 | new_n = xv.shape[0] * np.prod(xv.shape[2:]) 206 | else: 207 | new_n = xv.shape[0] 208 | for i, o in enumerate(x_col_order): 209 | if self.channels_last: 210 | new_mean = np.nanmean(xv[..., i]) 211 | new_var = np.nanvar(xv[..., i]) 212 | else: 213 | new_mean = np.nanmean(xv[:, i]) 214 | new_var = np.nanvar(xv[:, i]) 215 | combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / (self.n_ + new_n) 216 | weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / (self.n_ + new_n) 217 | var_correction = (self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2) / ( 218 | (self.n_ + new_n) ** 2) 219 | self.mean_x_[o] = combined_mean 220 | self.var_x_[o] = weighted_var + var_correction 221 | self.n_ += new_n 222 | self._fit = True 223 | 224 | def transform(self, x, channels_last=None): 225 | """ 226 | Transform the input data from its original form to standard scaled form. If your input data has a 227 | different dimension order than the data used to fit the scaler, use the channels_last keyword argument 228 | to specify whether the new data are `channels_last` (True) or `channels_first` (False). 229 | 230 | Args: 231 | x: Input data. 232 | channels_last: Override the default channels_last parameter of the scaler. 233 | 234 | Returns: 235 | x_transformed: Transformed data in the same shape and type as x. 
236 | """ 237 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 238 | x_mean, x_var = self.get_scales() 239 | if channels_last: 240 | for i, o in enumerate(x_col_order): 241 | x_transformed[..., i] = (xv[..., i] - x_mean[o]) / np.sqrt(x_var[o]) 242 | else: 243 | for i, o in enumerate(x_col_order): 244 | x_transformed[:, i] = (xv[:, i] - x_mean[o]) / np.sqrt(x_var[o]) 245 | x_transformed_final = self.package_transformed_x(x_transformed, x) 246 | return x_transformed_final 247 | 248 | def inverse_transform(self, x, channels_last=None): 249 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 250 | x_mean, x_var = self.get_scales() 251 | if channels_last: 252 | for i, o in enumerate(x_col_order): 253 | x_transformed[..., i] = xv[..., i] * np.sqrt(x_var[o]) + x_mean[o] 254 | else: 255 | for i, o in enumerate(x_col_order): 256 | x_transformed[:, i] = xv[:, i] * np.sqrt(x_var[o]) + x_mean[o] 257 | x_transformed = self.package_transformed_x(x_transformed, x) 258 | return x_transformed 259 | 260 | def get_scales(self): 261 | return self.mean_x_, self.var_x_ 262 | 263 | def __add__(self, other): 264 | assert type(other) is DStandardScaler, "Input is not DStandardScaler" 265 | assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match." 266 | current = deepcopy(self) 267 | current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / (self.n_ + other.n_) 268 | combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / (self.n_ + other.n_) 269 | combined_var_corr = (self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2) / ( 270 | (self.n_ + other.n_) ** 2) 271 | current.var_x_ = combined_var + combined_var_corr 272 | current.n_ = self.n_ + other.n_ 273 | return current 274 | 275 | 276 | class DMinMaxScaler(DBaseScaler): 277 | """ 278 | Distributed MinMaxScaler enables calculation of min and max of variables in datasets in parallel then combining 279 | the mins and maxes as a reduction step. Scaler 280 | supports numpy arrays, pandas dataframes, and xarray DataArrays and will return a transformed array in the 281 | same form as the original with column or coordinate names preserved. 
282 | 283 | """ 284 | def __init__(self, channels_last=True): 285 | self.max_x_ = None 286 | self.min_x_ = None 287 | super().__init__(channels_last=channels_last) 288 | 289 | def fit(self, x, weight=None): 290 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 291 | xv = self.extract_array(x) 292 | channel_dim = self.set_channel_dim() 293 | if not self._fit: 294 | self.x_columns_ = x_columns 295 | self.is_array_ = is_array 296 | self.max_x_ = np.zeros(xv.shape[channel_dim]) 297 | self.min_x_ = np.zeros(xv.shape[channel_dim]) 298 | if self.channels_last: 299 | for i in range(xv.shape[channel_dim]): 300 | self.max_x_[i] = np.nanmax(xv[..., i]) 301 | self.min_x_[i] = np.nanmin(xv[..., i]) 302 | else: 303 | for i in range(xv.shape[channel_dim]): 304 | self.max_x_[i] = np.nanmax(xv[:, i]) 305 | self.min_x_[i] = np.nanmin(xv[:, i]) 306 | else: 307 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 308 | if is_array: 309 | if self.channels_last: 310 | x_col_order = np.arange(x.shape[-1]) 311 | else: 312 | x_col_order = np.arange(x.shape[1]) 313 | else: 314 | x_col_order = self.get_column_order(x_columns) 315 | if self.channels_last: 316 | for i, o in enumerate(x_col_order): 317 | self.max_x_[o] = np.maximum(self.max_x_[o], np.nanmax(xv[..., i])) 318 | self.min_x_[o] = np.minimum(self.min_x_[o], np.nanmin(xv[..., i])) 319 | else: 320 | for i, o in enumerate(x_col_order): 321 | self.max_x_[o] = np.maximum(self.max_x_[o], np.nanmax(xv[:, i])) 322 | self.min_x_[o] = np.minimum(self.min_x_[o], np.nanmin(xv[:, i])) 323 | self._fit = True 324 | 325 | def transform(self, x, channels_last=None): 326 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 327 | if channels_last: 328 | for i, o in enumerate(x_col_order): 329 | x_transformed[..., i] = (xv[..., i] - self.min_x_[o]) / ( 330 | self.max_x_[o] - self.min_x_[o]) 331 | else: 332 | for i, o in enumerate(x_col_order): 333 | x_transformed[:, i] = (xv[:, i] - self.min_x_[o]) / ( 334 | self.max_x_[o] - self.min_x_[o]) 335 | x_transformed = self.package_transformed_x(x_transformed, x) 336 | return x_transformed 337 | 338 | def inverse_transform(self, x, channels_last=None): 339 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 340 | if channels_last: 341 | for i, o in enumerate(x_col_order): 342 | x_transformed[..., i] = xv[..., i] * (self.max_x_[o] - 343 | self.min_x_[o]) + self.min_x_[o] 344 | else: 345 | for i, o in enumerate(x_col_order): 346 | x_transformed[:, i] = xv[:, i] * (self.max_x_[o] - 347 | self.min_x_[o]) + self.min_x_[o] 348 | x_transformed = self.package_transformed_x(x_transformed, x) 349 | return x_transformed 350 | 351 | def get_scales(self): 352 | return self.min_x_, self.max_x_ 353 | 354 | def __add__(self, other): 355 | assert type(other) is DMinMaxScaler, "Input is not DMinMaxScaler" 356 | assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match."
357 | current = deepcopy(self) 358 | current.max_x_ = np.maximum(self.max_x_, other.max_x_) 359 | current.min_x_ = np.minimum(self.min_x_, other.min_x_) 360 | return current 361 | 362 | 363 | def fit_variable(var_index, xv_shared=None, compression=None, channels_last=None): 364 | xv = xv_shared 365 | td_obj = CTDigest(compression=compression) 366 | if channels_last: 367 | td_obj.update(xv[..., var_index].ravel()) 368 | else: 369 | td_obj.update(xv[:, var_index].ravel()) 370 | return td_obj 371 | 372 | 373 | def transform_variable(td_obj, xv, 374 | min_val=0.000001, max_val=0.9999999, distribution="normal"): 375 | td_centroids = td_obj.centroids() 376 | x_transformed = np.zeros_like(xv) 377 | tdigest_cdf(xv, td_centroids["mean"], td_centroids["weight"], 378 | td_obj.min(), td_obj.max(), x_transformed) 379 | x_transformed = np.minimum(x_transformed, max_val) 380 | x_transformed = np.maximum(x_transformed, min_val) 381 | if distribution == "normal": 382 | x_transformed = ndtri(x_transformed) 383 | elif distribution == "logistic": 384 | x_transformed = logistic.ppf(x_transformed) 385 | return x_transformed 386 | 387 | 388 | def inv_transform_variable(td_obj, xv, 389 | distribution="normal"): 390 | td_centroids = td_obj.centroids() 391 | x_transformed = np.zeros_like(xv) 392 | if distribution == "normal": 393 | x_intermediate = ndtr(xv) 394 | elif distribution == "logistic": 395 | x_intermediate = logistic.cdf(xv) 396 | else: 397 | x_intermediate = xv 398 | tdigest_quantile(x_intermediate, td_centroids["mean"], td_centroids["weight"], 399 | td_obj.min(), td_obj.max(), x_transformed) 400 | return x_transformed 401 | 402 | 403 | @guvectorize([void(float64[:], float64[:], float64[:], float64, float64, float64[:]), 404 | void(float32[:], float64[:], float64[:], float64, float64, float32[:])], "(m),(n),(n),(),()->(m)") 405 | def tdigest_cdf(xv, cent_mean, cent_weight, t_min, t_max, out): 406 | cent_merged_weight = np.zeros_like(cent_weight) 407 | cumulative_weight = 0 408 | for i in range(cent_weight.size): 409 | cent_merged_weight[i] = cumulative_weight + cent_weight[i] / 2.0 410 | cumulative_weight += cent_weight[i] 411 | total_weight = cent_weight.sum() 412 | for i, x in enumerate(xv): 413 | if cent_mean.size == 0: 414 | out[i] = np.nan 415 | continue 416 | # Single centroid 417 | if cent_mean.size == 1: 418 | if x < t_min: 419 | out[i] = 0.0 420 | elif x > t_max: 421 | out[i] = 1.0 422 | elif t_max - t_min < np.finfo(np.float64).eps: 423 | out[i] = 0.5 424 | else: 425 | out[i] = (x - t_min) / (t_max - t_min) 426 | continue 427 | # Equality checks only apply if > 1 centroid 428 | if x >= t_max: 429 | out[i] = 1.0 430 | continue 431 | elif x <= t_min: 432 | out[i] = 0.0 433 | continue 434 | 435 | # i_l = bisect_left_mean(T->merge_centroids, x, 0, T->ncentroids); 436 | i_l = np.searchsorted(cent_mean, x, side="left") 437 | if x < cent_mean[0]: 438 | # min < x < first centroid 439 | x0 = t_min 440 | x1 = cent_mean[0] 441 | dw = cent_merged_weight[0] / 2.0 442 | out[i] = dw * (x - x0) / (x1 - x0) / total_weight 443 | elif i_l == cent_mean.size: 444 | # last centroid < x < max 445 | x0 = cent_mean[i_l - 1] 446 | x1 = t_max 447 | dw = cent_weight[i_l - 1] / 2.0 448 | out[i] = 1.0 - dw * (x1 - x) / (x1 - x0) / total_weight 449 | elif cent_mean[i_l] == x: 450 | # x is equal to one or more centroids 451 | i_r = np.searchsorted(cent_mean, x, side="right") 452 | out[i] = cent_merged_weight[i_r] / total_weight 453 | else: 454 | assert cent_mean[i_l] > x 455 | x0 = cent_mean[i_l - 1] 456 | x1 = cent_mean[i_l] 
457 | dw = 0.5 * (cent_weight[i_l - 1] + cent_weight[i_l]) 458 | out[i] = (cent_merged_weight[i_l - 1] + dw * (x - x0) / (x1 - x0)) / total_weight 459 | 460 | 461 | @guvectorize([void(float64[:], float64[:], float64[:], float64, float64, float64[:]), 462 | void(float32[:], float64[:], float64[:], float64, float64, float32[:])], "(m),(n),(n),(),()->(m)") 463 | def tdigest_quantile(qv, cent_mean, cent_weight, t_min, t_max, out): 464 | cent_merged_weight = np.zeros_like(cent_weight) 465 | cumulative_weight = 0 466 | for i in range(cent_weight.size): 467 | cent_merged_weight[i] = cumulative_weight + cent_weight[i] / 2.0 468 | cumulative_weight += cent_weight[i] 469 | total_weight = cent_weight.sum() 470 | for i, q in enumerate(qv): 471 | if total_weight == 0: 472 | out[i] = np.nan 473 | continue 474 | if q <= 0: 475 | out[i] = t_min 476 | continue 477 | if q >= 1: 478 | out[i] = t_max 479 | continue 480 | if cent_mean.size == 1: 481 | out[i] = cent_mean[0] 482 | continue 483 | 484 | index = q * total_weight 485 | b = np.searchsorted(cent_merged_weight, index, side="left") 486 | if b == 0: 487 | x0 = 0 488 | y0 = t_min 489 | else: 490 | x0 = cent_merged_weight[b - 1] 491 | y0 = cent_mean[b - 1] 492 | 493 | if b == cent_mean.size: 494 | x1 = total_weight 495 | y1 = t_max 496 | else: 497 | x1 = cent_merged_weight[b] 498 | y1 = cent_mean[b] 499 | out[i] = y0 + (index - x0) * (y1 - y0) / (x1 - x0) 500 | 501 | 502 | class DQuantileScaler(DBaseScaler): 503 | """ 504 | Distributed Quantile Scaler that uses the crick TDigest Cython library to compute quantiles across multiple 505 | datasets in parallel. The library can perform fitting, transforms, and inverse transforms across variables 506 | in parallel using the multiprocessing library. Multidimensional arrays are stored in shared memory across 507 | processes to minimize inter-process communication. 508 | 509 | DQuantileScaler supports numpy arrays, pandas DataFrames, and xarray DataArrays and will return a transformed array in the same form as the original with column or coordinate names preserved. 510 | 511 | Attributes: 512 | compression: Recommended number of centroids to use. 513 | distribution: "uniform", "normal", or "logistic". 514 | min_val: Minimum value for quantile to prevent -inf results when distribution is normal or logistic. 515 | max_val: Maximum value for quantile to prevent inf results when distribution is normal or logistic. 516 | channels_last: Whether to assume the last dim or second dim are the channel/variable dimension.
517 | """ 518 | def __init__(self, compression=250, distribution="uniform", min_val=0.0000001, max_val=0.9999999, channels_last=True): 519 | self.compression = compression 520 | self.distribution = distribution 521 | self.min_val = min_val 522 | self.max_val = max_val 523 | self.centroids_ = None 524 | self.size_ = None 525 | self.min_ = None 526 | self.max_ = None 527 | 528 | super().__init__(channels_last=channels_last) 529 | 530 | def td_objs_to_attributes(self, td_objs): 531 | self.centroids_ = [structured_to_unstructured(td_obj.centroids()) for td_obj in td_objs] 532 | self.size_ = np.array([td_obj.size() for td_obj in td_objs]) 533 | self.min_ = np.array([td_obj.min() for td_obj in td_objs]) 534 | self.max_ = np.array([td_obj.max() for td_obj in td_objs]) 535 | return 536 | 537 | def attributes_to_td_objs(self): 538 | td_objs = [] 539 | if self.is_fit(): 540 | for i in range(self.max_.size): 541 | td_objs.append(CTDigest(self.compression)) 542 | td_objs[-1].__setstate__((unstructured_to_structured(self.centroids_[i], CENTROID_DTYPE), 543 | self.size_[i], 544 | self.min_[i], 545 | self.max_[i])) 546 | return td_objs 547 | 548 | def fit(self, x, weight=None): 549 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 550 | xv = self.extract_array(x) 551 | channel_dim = self.set_channel_dim() 552 | if not self._fit: 553 | self.x_columns_ = x_columns 554 | self.is_array_ = is_array 555 | fit_var_func = partial(fit_variable, 556 | xv_shared=xv, 557 | compression=self.compression, 558 | channels_last=self.channels_last) 559 | td_objs = [fit_var_func(x) for x in np.arange(xv.shape[channel_dim])] 560 | self.td_objs_to_attributes(td_objs) 561 | else: 562 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 563 | if is_array: 564 | if self.channels_last: 565 | x_col_order = np.arange(x.shape[-1]) 566 | else: 567 | x_col_order = np.arange(x.shape[1]) 568 | else: 569 | x_col_order = self.get_column_order(x_columns) 570 | td_objs = self.attributes_to_td_objs() 571 | fit_var_func = partial(fit_variable, 572 | xv_shared=xv, 573 | compression=self.compression, 574 | channels_last=self.channels_last) 575 | new_td_objs = [fit_var_func(x) for x in np.arange(xv.shape[channel_dim])] 576 | for i, o in enumerate(x_col_order): 577 | td_objs[o].merge(new_td_objs[i]) 578 | self.td_objs_to_attributes(td_objs) 579 | self._fit = True 580 | return 581 | 582 | def transform(self, x, channels_last=None, pool=None): 583 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 584 | td_objs = self.attributes_to_td_objs() 585 | td_i_objs = [(i, td_objs[o]) for i, o in enumerate(x_col_order)] 586 | 587 | trans_var_func = partial(transform_variable, 588 | min_val=self.min_val, max_val=self.max_val, 589 | distribution=self.distribution) 590 | if channels_last: 591 | if pool is not None: 592 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 593 | xt_shape = x_transformed[..., 0].shape 594 | outputs = [] 595 | for td_obj in td_i_objs: 596 | for s, s_ind in enumerate(split_indices[1:]): 597 | outputs.append(pool.apply_async(trans_var_func, (td_obj[1], 598 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 599 | x_transformed[..., td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 600 | del outputs[:] 601 | else: 602 | for td_obj in td_i_objs: 603 | x_transformed[..., td_obj[0]] = trans_var_func(td_obj[1], xv[..., 
td_obj[0]]) 604 | else: 605 | if pool is not None: 606 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 607 | xt_shape = x_transformed[:, 0].shape 608 | outputs = [] 609 | for td_obj in td_i_objs: 610 | for s, s_ind in enumerate(split_indices[1:]): 611 | outputs.append(pool.apply_async(trans_var_func, (td_obj[1], 612 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 613 | x_transformed[:, td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 614 | del outputs[:] 615 | else: 616 | for td_obj in td_i_objs: 617 | x_transformed[:, td_obj[0]] = trans_var_func(td_obj[1], xv[:, td_obj[0]]) 618 | x_transformed_final = self.package_transformed_x(x_transformed, x) 619 | return x_transformed_final 620 | 621 | def fit_transform(self, x, channels_last=None, weight=None, pool=None): 622 | self.fit(x, weight=weight) 623 | return self.transform(x, channels_last=channels_last, pool=pool) 624 | 625 | def inverse_transform(self, x, channels_last=None, pool=None): 626 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 627 | td_objs = self.attributes_to_td_objs() 628 | td_i_objs = [(i, td_objs[o]) for i, o in enumerate(x_col_order)] 629 | inv_trans_var_func = partial(inv_transform_variable, 630 | distribution=self.distribution) 631 | if channels_last: 632 | if pool is not None: 633 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 634 | xt_shape = x_transformed[..., 0].shape 635 | outputs = [] 636 | for td_obj in td_i_objs: 637 | for s, s_ind in enumerate(split_indices[1:]): 638 | outputs.append(pool.apply_async(inv_trans_var_func, (td_obj[1], 639 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 640 | x_transformed[..., td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 641 | del outputs[:] 642 | else: 643 | for td_obj in td_i_objs: 644 | x_transformed[..., td_obj[0]] = inv_trans_var_func(td_obj[1], xv[..., td_obj[0]]) 645 | else: 646 | if pool is not None: 647 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 648 | xt_shape = x_transformed[:, 0].shape 649 | outputs = [] 650 | for td_obj in td_i_objs: 651 | for s, s_ind in enumerate(split_indices[1:]): 652 | outputs.append(pool.apply_async(inv_trans_var_func, (td_obj[1], 653 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 654 | x_transformed[:, td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 655 | del outputs[:] 656 | else: 657 | for td_obj in td_i_objs: 658 | x_transformed[:, td_obj[0]] = inv_trans_var_func(td_obj[1], xv[:, td_obj[0]]) 659 | x_transformed_final = self.package_transformed_x(x_transformed, x) 660 | return x_transformed_final 661 | 662 | def __add__(self, other): 663 | current = deepcopy(self) 664 | td_objs = current.attributes_to_td_objs() 665 | other_td_objs = other.attributes_to_td_objs() 666 | assert type(other) is DQuantileScaler, "Adding mismatched scaler types." 667 | assert current.is_fit() and other.is_fit(), "At least one scaler is not fit." 668 | x_col_order = current.get_column_order(other.x_columns_) 669 | assert x_col_order.size > 0, "No matching columns in other DQuantileScaler" 670 | for i, o in enumerate(x_col_order): 671 | td_objs[o].merge(other_td_objs[i]) 672 | current.td_objs_to_attributes(td_objs) 673 | return current 674 | 675 | --------------------------------------------------------------------------------
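
The DStandardScaler docstring above describes a map-reduce style workflow: fit a separate scaler on each chunk of data, then add the fitted scalers together to represent the full dataset. The short sketch below illustrates that workflow; it is only a sketch, and the chunk count, column names, and random data are assumptions made for the example rather than anything taken from the repository.

import numpy as np
import pandas as pd
from functools import reduce
from bridgescaler import DStandardScaler

# Synthetic stand-ins for separate data files (illustrative shapes and names only).
columns = ["a", "b", "c"]
chunks = [pd.DataFrame(np.random.normal(size=(500, 3)), columns=columns) for _ in range(4)]

# Map step: fit one scaler per chunk; each fit could run in its own job or process.
partial_scalers = []
for chunk in chunks:
    scaler = DStandardScaler()
    scaler.fit(chunk)
    partial_scalers.append(scaler)

# Reduce step: adding scalers combines their counts, means, and variances.
full_scaler = reduce(lambda a, b: a + b, partial_scalers)

# Transform new data with the combined statistics; the output keeps the DataFrame columns.
new_data = pd.DataFrame(np.random.normal(size=(100, 3)), columns=columns)
scaled = full_scaler.transform(new_data)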
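
DQuantileScaler follows the same fit, add, and transform pattern, and its transform and inverse_transform methods additionally accept an optional multiprocessing pool to split the work across processes, as seen in the source above. The sketch below is a minimal, assumed usage example: the gamma-distributed data, array shape, and pool size are invented for illustration.

import multiprocessing as mp
import numpy as np
from bridgescaler import DQuantileScaler

if __name__ == "__main__":
    # Skewed synthetic data (shape and distribution chosen only for the example).
    x = np.random.gamma(shape=2.0, scale=10.0, size=(2000, 4))

    dqs = DQuantileScaler(compression=250, distribution="normal")
    dqs.fit(x)

    with mp.Pool(4) as pool:
        # Quantiles come from the per-variable t-digests and are mapped through the normal PPF.
        x_norm = dqs.transform(x, pool=pool)
        # The inverse transform approximately recovers the original values.
        x_back = dqs.inverse_transform(x_norm, pool=pool)
        print("max reconstruction error:", np.abs(x_back - x).max())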