├── bridgescaler
│   ├── VERSION
│   ├── __init__.py
│   ├── tests
│   │   ├── backend_test.py
│   │   ├── deep_test.py
│   │   ├── group_test.py
│   │   ├── distributed_tensor_test.py
│   │   └── distributed_test.py
│   ├── deep.py
│   ├── backend.py
│   ├── group.py
│   ├── distributed_tensor.py
│   └── distributed.py
├── setup.py
├── MANIFEST.in
├── requirements.txt
├── doc
│   ├── source
│   │   ├── _static
│   │   │   ├── logo.graffle
│   │   │   └── bridgescaler_logo.png
│   │   ├── modules.rst
│   │   ├── gettingstarted.rst
│   │   ├── index.rst
│   │   ├── bridgescaler.rst
│   │   ├── group.rst
│   │   ├── conf.py
│   │   ├── usage.rst
│   │   └── distributed.rst
│   ├── Makefile
│   └── make.bat
├── environment.yml
├── scripts
│   ├── numpy_mem.py
│   ├── scipy_ppf_example.py
│   └── eval_scaler.py
├── .readthedocs.yaml
├── setup.cfg
├── LICENSE
├── .github
│   └── workflows
│       ├── python-publish.yml
│       └── python-package-conda.yml
├── .gitignore
├── README.md
└── notebooks
    └── Bridgscaler_intro.ipynb
/bridgescaler/VERSION:
--------------------------------------------------------------------------------
1 | 0.8.0
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | if __name__ == "__main__":
4 |     setup()
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include bridgescaler/*.py
3 | include bridgescaler/VERSION
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy<2.0
3 | scikit-learn>=1.0
4 | crick
5 | xarray
6 | scipy>=1.11.0
7 | numba
8 |
--------------------------------------------------------------------------------
/doc/source/_static/logo.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/bridgescaler/main/doc/source/_static/logo.graffle
--------------------------------------------------------------------------------
/doc/source/modules.rst:
--------------------------------------------------------------------------------
1 | bridgescaler
2 | ============
3 |
4 | ..
toctree:: 5 | :maxdepth: 4 6 | 7 | bridgescaler 8 | -------------------------------------------------------------------------------- /doc/source/_static/bridgescaler_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/bridgescaler/main/doc/source/_static/bridgescaler_logo.png -------------------------------------------------------------------------------- /bridgescaler/__init__.py: -------------------------------------------------------------------------------- 1 | from .backend import save_scaler, load_scaler, print_scaler, read_scaler 2 | from .group import GroupStandardScaler, GroupRobustScaler, GroupMinMaxScaler 3 | from .deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 4 | from .distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bridgescaler 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - numpy<2 8 | - scipy>=1.11.0 9 | - pandas 10 | - scikit-learn 11 | - pyarrow 12 | - pytest 13 | - sphinx 14 | - xarray 15 | - crick 16 | - numba 17 | - sphinx-book-theme 18 | - pip: 19 | - . 20 | -------------------------------------------------------------------------------- /scripts/numpy_mem.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import psutil 6 | import xarray as xr 7 | mem = [] 8 | def get_data(): 9 | return np.zeros((1000, 50, 50), dtype=np.float32) 10 | data = get_data() 11 | for i in range(data.shape[0]): 12 | data[i] = np.random.random((50, 50)) 13 | mem.append(psutil.virtual_memory()[1]) 14 | mem.append(psutil.virtual_memory()[1]) 15 | xd = xr.DataArray(data) 16 | mem.append(psutil.virtual_memory()[1]) 17 | plt.plot(mem) 18 | plt.savefig("mem_profile.png", dpi=200, bbox_inches="tight") 19 | -------------------------------------------------------------------------------- /doc/source/gettingstarted.rst: -------------------------------------------------------------------------------- 1 | .. title:: Getting Started 2 | 3 | .. gettingstarted: 4 | 5 | Getting Started 6 | =============== 7 | 8 | Dependencies 9 | ------------ 10 | * scikit-learn 11 | * numpy 12 | * pandas 13 | * xarray 14 | * pydigest 15 | 16 | Installation 17 | ------------ 18 | For a stable version of bridgescaler, you can install from PyPI. 19 | 20 | .. code-block:: bash 21 | 22 | pip install bridgescaler 23 | 24 | For the latest version of bridgescaler, install from github. 25 | 26 | .. code-block:: bash 27 | 28 | git clone https://github.com/NCAR/bridgescaler.git 29 | cd bridgescaler 30 | pip install . 31 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. bridgescaler documentation master file, created by 2 | sphinx-quickstart on Wed Feb 7 10:59:45 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Bridgescaler 7 | ======================================== 8 | Bridgescaler is a library to support reproducible and 9 | distributed scaling of data for pre-processing of AI and ML models. 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: Contents: 14 | 15 | gettingstarted.rst 16 | usage.rst 17 | distributed.rst 18 | group.rst 19 | modules.rst 20 | 21 | 22 | 23 | Indices and tables 24 | ================== 25 | 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/bridgescaler.rst: -------------------------------------------------------------------------------- 1 | bridgescaler package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | bridgescaler.backend module 8 | --------------------------- 9 | 10 | .. automodule:: bridgescaler.backend 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | bridgescaler.deep module 16 | ------------------------ 17 | 18 | .. automodule:: bridgescaler.deep 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | bridgescaler.distributed module 24 | ------------------------------- 25 | 26 | .. automodule:: bridgescaler.distributed 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | bridgescaler.group module 32 | ------------------------- 33 | 34 | .. automodule:: bridgescaler.group 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: bridgescaler 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "mambaforge-22.9" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | conda: 19 | environment: environment.yml 20 | # Build documentation in the "docs/" directory with Sphinx 21 | sphinx: 22 | configuration: doc/source/conf.py 23 | 24 | # Optionally build your docs in additional formats such as PDF and ePub 25 | # formats: 26 | # - pdf 27 | # - epub 28 | 29 | # Optional but recommended, declare the Python requirements required 30 | # to build your documentation 31 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 32 | # python: 33 | # install: 34 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = bridgescaler 3 | description = Tool to automagically save scikit-learn scaler properties to a portable, readable format. 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | author = David John Gagne 7 | author_email = dgagne@ucar.edu 8 | version = file: bridgescaler/VERSION 9 | license = MIT 10 | license_file = LICENSE 11 | platform = any 12 | keywords = machine learning 13 | classifiers = 14 | Programming Language :: Python 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | url = https://github.com/NCAR/bridgescaler 20 | 21 | [options] 22 | zip_safe = True 23 | packages = find: 24 | include_package_data = True 25 | setup_requires = setuptools 26 | python_requires = >=3.7 27 | install_requires = 28 | scikit-learn>=1.0 29 | numpy 30 | pandas 31 | crick 32 | scipy 33 | xarray 34 | numba 35 | sphinx 36 | sphinx-book-theme 37 | -------------------------------------------------------------------------------- /doc/source/group.rst: -------------------------------------------------------------------------------- 1 | .. title:: Group Scalers 2 | 3 | .. group: 4 | 5 | Group Scalers 6 | ============= 7 | 8 | The group scalers use the same scaling parameters for a group of similar 9 | variables rather than scaling each column independently. This is useful for situations where variables are related, 10 | such as temperatures at different height levels. 11 | 12 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 13 | for numpy arrays. 14 | 15 | For example: 16 | 17 | .. 
code-block:: python 18 | 19 | from bridgescaler.group import GroupStandardScaler 20 | import pandas as pd 21 | import numpy as np 22 | x_rand = np.random.random(size=(100, 5)) 23 | data = pd.DataFrame(data=x_rand, 24 | columns=["a", "b", "c", "d", "e"]) 25 | groups = [["a", "b"], ["c", "d"], "e"] 26 | group_scaler = GroupStandardScaler() 27 | x_transformed = group_scaler.fit_transform(data, groups=groups) 28 | 29 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 30 | deviation for that group. -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'bridgescaler' 10 | copyright = '2024, University Corporation for Atmopsheric Research' 11 | author = 'David John Gagne' 12 | release = '0.8.0' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ['sphinx.ext.napoleon'] 18 | 19 | templates_path = ['_templates'] 20 | exclude_patterns = [] 21 | 22 | 23 | 24 | # -- Options for HTML output ------------------------------------------------- 25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 26 | 27 | html_theme = 'sphinx_book_theme' 28 | html_static_path = ['_static'] 29 | html_logo = "_static/bridgescaler_logo.png" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 University Corporation for Atmospheric Research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /scripts/scipy_ppf_example.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import norm 2 | from scipy.special import ndtri 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import psutil 6 | import gc 7 | 8 | process = psutil.Process() 9 | n_elements = 301 10 | mem_vals = np.zeros(n_elements) 11 | mem_vals[0] = process.memory_info().rss / 1e6 12 | for i in range(1, n_elements): 13 | x = np.random.random(size=(100, 50, 50)) 14 | ppf_val = ndtri(x) 15 | mem_vals[i] = process.memory_info().rss / 1e6 16 | gc.collect() 17 | plt.plot(mem_vals[1:] - mem_vals[0], label="ndtri") 18 | mem_vals = np.zeros(n_elements) 19 | mem_vals[0] = process.memory_info().rss / 1e6 20 | 21 | for i in range(1, n_elements): 22 | x = np.random.random(size=(100, 50, 50)) 23 | ppf_val = norm.ppf(x) 24 | mem_vals[i] = process.memory_info().rss / 1e6 25 | gc.collect() 26 | plt.plot(mem_vals[1:] - mem_vals[0], label="norm.ppf") 27 | mem_vals = np.zeros(n_elements) 28 | mem_vals[0] = process.memory_info().rss / 1e6 29 | for i in range(1, n_elements): 30 | x = np.random.random(size=(100, 50, 50)) 31 | mem_vals[i] = process.memory_info().rss / 1e6 32 | gc.collect() 33 | plt.plot(mem_vals[1:] - mem_vals[0], label="control") 34 | plt.xlabel("Iterations") 35 | plt.ylabel("Memory usage (MB)") 36 | plt.legend() 37 | plt.savefig("norm_usage_tracking.png", dpi=200, bbox_inches="tight") 38 | -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | workflow_dispatch: 9 | 10 | permissions: 11 | actions: write 12 | checks: write 13 | contents: read 14 | pull-requests: write 15 | statuses: write 16 | 17 | jobs: 18 | build-linux: 19 | runs-on: ubuntu-latest 20 | strategy: 21 | max-parallel: 5 22 | defaults: 23 | run: 24 | shell: bash -l {0} 25 | steps: 26 | - uses: actions/checkout@v6 27 | - name: Setup Python 
28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: '3.11' 31 | cache: 'pip' 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade uv 35 | uv pip install torch --system --index-url https://download.pytorch.org/whl/cpu 36 | uv pip install . --system 37 | uv pip install ruff pytest --system 38 | - name: Lint with ruff 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | ruff check --select=E9,F63,F7,F82 --exit-zero 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | ruff check --output-format concise --exit-zero 44 | # Checking documentation errors 45 | ruff check --select=D --exit-zero --statistics 46 | - name: Test with pytest 47 | run: | 48 | pytest 49 | -------------------------------------------------------------------------------- /doc/source/usage.rst: -------------------------------------------------------------------------------- 1 | .. title:: Basic Usage 2 | 3 | .. usage: 4 | 5 | Basic Usage 6 | =========== 7 | bridgescaler supports all the common scikit-learn scaler classes: 8 | 9 | * StandardScaler 10 | * RobustScaler 11 | * MinMaxScaler 12 | * MaxAbsScaler 13 | * QuantileTransformer 14 | * PowerTransformer 15 | * SplineTransformer 16 | 17 | First, create some synthetic data to transform. 18 | 19 | .. code-block:: python 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | # specify distribution parameters for each variable 25 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 26 | scales = np.array([1.0, 10, 0.1, 5000.0]) 27 | names = ["A", "B", "C", "D"] 28 | num_examples = 205 29 | x_data_dict = {} 30 | for l in range(locs.shape[0]): 31 | # sample from random normal with different parameters 32 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 33 | x_data = pd.DataFrame(x_data_dict) 34 | 35 | Now, let's fit and transform the data with StandardScaler. 36 | 37 | .. code-block:: python 38 | 39 | from sklearn.preprocessing import StandardScaler 40 | from bridgescaler import save_scaler, load_scaler 41 | scaler = StandardScaler() 42 | scaler.fit_transform(x_data) 43 | filename = "x_standard_scaler.json" 44 | # save to json file 45 | save_scaler(scaler, filename) 46 | # create new StandardScaler from json file information. 47 | new_scaler = load_scaler(filename) 48 | # new_scaler is a StandardScaler object -------------------------------------------------------------------------------- /doc/source/distributed.rst: -------------------------------------------------------------------------------- 1 | .. title:: Distributed Scalers 2 | 3 | .. distributed: 4 | 5 | Distributed Scalers 6 | =================== 7 | The distributed scalers allow you to calculate scaling 8 | parameters on different subsets of a dataset and then combine the scaling factors 9 | together to get representative scaling values for the full dataset. Distributed 10 | Standard Scalers, MinMax Scalers, and Quantile Transformers have been implemented and work with both tabular 11 | and muliti-dimensional patch data in numpy, pandas DataFrame, and xarray DataArray formats. 12 | 13 | By default, the scaler assumes your channel/variable dimension is the last 14 | dimension, but if `channels_last=False` is set in the `__init__`, `transform`, 15 | or `inverse_transform` methods, then the 2nd dimension is assumed to be the variable 16 | dimension. It is possible to fit data with one ordering and then 17 | transform it with a different one. 
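For instance, the minimal sketch below (the array shapes are only illustrative) fits a scaler on
channels-last data and then transforms a channels-first view of the same array:

.. code-block:: python

    from bridgescaler.distributed import DStandardScaler
    import numpy as np

    x = np.random.normal(0, 1, (16, 8, 8, 3))  # variable dimension last
    dss = DStandardScaler(channels_last=True)
    dss.fit(x)
    x_first = np.transpose(x, (0, 3, 1, 2))  # variable dimension second
    x_scaled = dss.transform(x_first, channels_last=False)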
18 | 19 | For large datasets, it may be expensive to redo the scalers if you want to use a 20 | subset or different ordering of variables. However, in bridgescaler, the 21 | Distributed Scalers all support arbitrary ordering and subsets of variables for transforms if 22 | the input data are in a Xarray DataArray or Pandas DataFrame with variable 23 | names that match the original data. 24 | 25 | Example: 26 | 27 | .. code-block:: python 28 | 29 | from bridgescaler.distributed import DStandardScaler 30 | import numpy as np 31 | 32 | x_1 = np.random.normal(0, 2.2, (20, 5, 4, 8)) 33 | x_2 = np.random.normal(1, 3.5, (25, 4, 8, 5)) 34 | 35 | dss_1 = DStandardScaler(channels_last=False) 36 | dss_2 = DStandardScaler(channels_last=True) 37 | dss_1.fit(x_1) 38 | dss_2.fit(x_2) 39 | dss_combined = np.sum([dss_1, dss_2]) 40 | 41 | dss_combined.transform(x_1, channels_last=False) 42 | 43 | Distributed scalers can be stored in individual json files or within 44 | a pandas DataFrame for easy loading and combining later. 45 | 46 | .. code-block:: python 47 | 48 | import pandas as pd 49 | from bridgescaler import print_scaler, read_scaler 50 | scaler_list = [dss_1, dss_2] 51 | df = pd.DataFrame({"scalers": [print_scaler(s) for s in scaler_list]}) 52 | df.to_parquet("scalers.parquet") 53 | df_new = pd.read_parquet("scalers.parquet") 54 | scaler_objs = df_new["scalers"].apply(read_scaler) 55 | total_scaler = scaler_objs.sum() 56 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /bridgescaler/tests/backend_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 2 | from bridgescaler.backend import create_synthetic_data 3 | import numpy as np 4 | import os 5 | from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer 6 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 7 | from pandas import DataFrame 8 | from os.path import exists 9 | 10 | 11 | scaler_objs = {"StandardScaler": StandardScaler, 12 | "MinMaxScaler": MinMaxScaler, 13 | "RobustScaler": RobustScaler, 14 | "QuantileTransformer": QuantileTransformer, 15 | "DStandardScaler": DStandardScaler, 16 | "DMinMaxScaler": DMinMaxScaler, 17 | "DQuantileScaler": DQuantileScaler} 18 | 19 | 20 | def test_scaler_io(): 21 | try: 22 | x_data = create_synthetic_data() 23 | for scaler_name, scaler_obj in scaler_objs.items(): 24 | scaler = scaler_obj() 25 | x_scaled_data = scaler.fit_transform(x_data) 26 | save_scaler(scaler, "test.json") 27 | assert exists("test.json") 28 | loaded_scaler = load_scaler("test.json") 29 | assert type(loaded_scaler) is type(scaler), "Type mismatch" 30 | loaded_scaled_data = loaded_scaler.transform(x_data) 31 | if type(x_scaled_data) is DataFrame: 32 | transform_diff = np.max(np.abs(x_scaled_data.values - loaded_scaled_data.values)) 33 | else: 34 | transform_diff = np.max(np.abs(x_scaled_data - loaded_scaled_data)) 35 | 36 | assert transform_diff < np.finfo(np.float32).eps, scaler_name + " transform does not match" 37 | finally: 38 | if exists("test.json"): 39 | os.remove("test.json") 40 | return 41 | 42 | 43 | def test_scaler_str(): 44 | x_data = create_synthetic_data() 45 | for scaler_name, scaler_obj in scaler_objs.items(): 46 | scaler = scaler_obj() 47 | x_scaled_data = scaler.fit_transform(x_data) 48 | scaler_str = print_scaler(scaler) 49 | loaded_scaler = read_scaler(scaler_str) 50 | assert type(loaded_scaler) is type(scaler), "Type Mismatch" 51 | loaded_scaled_data = loaded_scaler.transform(x_data) 52 | if type(x_scaled_data) is DataFrame: 53 | transform_diff = np.max(np.abs(x_scaled_data.values - loaded_scaled_data.values)) 54 | else: 55 | transform_diff = np.max(np.abs(x_scaled_data - loaded_scaled_data)) 56 | assert transform_diff < np.finfo(np.float32).eps, scaler_name + " transform does not match" 57 | -------------------------------------------------------------------------------- /bridgescaler/tests/deep_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 2 | from sklearn.preprocessing import QuantileTransformer 3 | from bridgescaler import save_scaler, load_scaler 4 | import numpy as np 5 | from 
os.path import exists 6 | import os 7 | 8 | 9 | def test_deep_standard_scaler(): 10 | save_filename = "deep_standard.json" 11 | try: 12 | np.random.seed(352680) 13 | 14 | n_ex = 5000 15 | n_channels = 4 16 | dim = 32 17 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 18 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 19 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 20 | for chan in range(n_channels): 21 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 22 | dss = DeepStandardScaler() 23 | dss.fit(x) 24 | x_transformed = dss.transform(x) 25 | x_telephone = dss.inverse_transform(x_transformed) 26 | assert x_transformed.shape == x.shape, "Shape mismatch" 27 | assert np.mean(np.abs(x_telephone - x)) < 10 * np.finfo(np.float32).eps, "Significant differences" 28 | save_scaler(dss, save_filename) 29 | reloaded_scaler = load_scaler(save_filename) 30 | x_t_r = reloaded_scaler.transform(x) 31 | assert np.all(x_transformed == x_t_r), "Scaler reloads properly" 32 | finally: 33 | if exists(save_filename): 34 | os.remove(save_filename) 35 | return 36 | 37 | 38 | def test_deep_minmax_scaler(): 39 | np.random.seed(352680) 40 | n_ex = 5000 41 | n_channels = 4 42 | dim = 32 43 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 44 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 45 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 46 | for chan in range(n_channels): 47 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 48 | dss = DeepMinMaxScaler() 49 | dss.fit(x) 50 | x_transformed = dss.transform(x) 51 | x_telephone = dss.inverse_transform(x_transformed) 52 | assert x_transformed.shape == x.shape, "Shape mismatch" 53 | assert x_transformed.max() <= 1, "Max greater than 1" 54 | assert x_transformed.min() >= 0, "Min less than 0" 55 | assert np.mean(np.abs(x_telephone - x)) < 50 * np.finfo(np.float32).eps, "Significant differences" 56 | return 57 | 58 | 59 | def test_deep_quantile_transformer(): 60 | np.random.seed(352680) 61 | n_ex = 1000 62 | n_channels = 3 63 | dim = 16 64 | means = np.array([1, 5, -4, 2.5], dtype=np.float64) 65 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float64) 66 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float64) 67 | for chan in range(n_channels): 68 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 69 | dqs = DeepQuantileTransformer(n_quantiles=1000, stochastic=True) 70 | dqs.fit(x) 71 | x_transformed = dqs.transform(x) 72 | x_telephone = dqs.inverse_transform(x_transformed) 73 | reg_qs = QuantileTransformer(n_quantiles=1000, subsample=dim * dim * n_ex) 74 | def flatten_to_2D(X): 75 | return np.reshape(X, newshape=(X.shape[0] * X.shape[1] * X.shape[2], X.shape[-1])) 76 | 77 | x_flat = flatten_to_2D(x) 78 | x_scaled = reg_qs.fit_transform(x_flat) 79 | x_tel_2 = np.reshape(reg_qs.inverse_transform(x_scaled), newshape=(x.shape[0], x.shape[1], x.shape[2], x.shape[3])) 80 | full_diff = np.abs(x - x_telephone).ravel() 81 | reg_diff = np.abs(x_tel_2 - x).ravel() 82 | assert x_transformed.shape == x.shape, "Shape mismatch" 83 | assert x_transformed.max() <= 1, "Max greater than 1" 84 | assert x_transformed.min() >= 0, "Min less than 0" 85 | assert np.max(np.abs(full_diff - reg_diff)) < 1e-8, "significant differences in differences." 
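    # The inverse transform should also recover the original values to within interpolation tolerance.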
86 | assert np.max(np.abs(x_telephone - x)) < 1e-8, "Significant differences" 87 | return -------------------------------------------------------------------------------- /bridgescaler/tests/group_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.group import GroupStandardScaler, GroupMinMaxScaler, GroupRobustScaler 2 | from bridgescaler.backend import create_synthetic_data 3 | from bridgescaler import save_scaler, load_scaler 4 | import numpy as np 5 | import pandas as pd 6 | from os.path import exists 7 | import os 8 | 9 | 10 | def test_group_standard_scaler(): 11 | try: 12 | x_data = create_synthetic_data() 13 | x_data_numpy = x_data.values 14 | groups = [["A", "B"], "C", "D"] 15 | n_groups = [[0, 1], 2, 3] 16 | save_filename = "group_test.json" 17 | group_scaler_n = GroupStandardScaler() 18 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 19 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 20 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 21 | group_scaler = GroupStandardScaler() 22 | transformed_x = group_scaler.fit_transform(x_data, groups) 23 | assert transformed_x.shape == x_data.shape 24 | inverse_x = group_scaler.inverse_transform(transformed_x) 25 | assert inverse_x.shape == x_data.shape 26 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 27 | save_scaler(group_scaler, save_filename) 28 | reloaded_scaler = load_scaler(save_filename) 29 | reloaded_scale_x = reloaded_scaler.transform(x_data) 30 | assert np.all(transformed_x == reloaded_scale_x) 31 | finally: 32 | if exists("group_test.json"): 33 | os.remove("group_test.json") 34 | return 35 | 36 | def test_group_minmax_scaler(): 37 | try: 38 | x_data = create_synthetic_data() 39 | x_data_numpy = x_data.values 40 | groups = [["A", "B"], "C", "D"] 41 | n_groups = [[0, 1], 2, 3] 42 | save_filename = "group_test.json" 43 | group_scaler_n = GroupMinMaxScaler() 44 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 45 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 46 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 47 | group_scaler = GroupMinMaxScaler() 48 | transformed_x = group_scaler.fit_transform(x_data, groups) 49 | assert transformed_x.shape == x_data.shape 50 | inverse_x = group_scaler.inverse_transform(transformed_x) 51 | assert inverse_x.shape == x_data.shape 52 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 53 | save_scaler(group_scaler, save_filename) 54 | reloaded_scaler = load_scaler(save_filename) 55 | reloaded_scale_x = reloaded_scaler.transform(x_data) 56 | assert np.all(transformed_x == reloaded_scale_x) 57 | finally: 58 | if exists("group_test.json"): 59 | os.remove("group_test.json") 60 | return 61 | 62 | 63 | def test_group_robust_scaler(): 64 | try: 65 | x_data = create_synthetic_data() 66 | x_data_numpy = x_data.values 67 | groups = [["A", "B"], "C", "D"] 68 | n_groups = [[0, 1], 2, 3] 69 | save_filename = "group_test.json" 70 | group_scaler_n = GroupRobustScaler() 71 | n_transformed = group_scaler_n.fit_transform(x_data_numpy, n_groups) 72 | n_inv_transformed = group_scaler_n.inverse_transform(n_transformed) 73 | assert np.max(np.abs(n_inv_transformed - x_data_numpy)) < np.finfo(np.float32).eps 74 | group_scaler = GroupRobustScaler() 75 | transformed_x = group_scaler.fit_transform(x_data, groups) 76 | assert 
transformed_x.shape == x_data.shape 77 | inverse_x = group_scaler.inverse_transform(transformed_x) 78 | assert inverse_x.shape == x_data.shape 79 | assert np.max(np.abs(inverse_x.values - x_data.values)) < np.finfo(np.float32).eps 80 | save_scaler(group_scaler, save_filename) 81 | reloaded_scaler = load_scaler(save_filename) 82 | reloaded_scale_x = reloaded_scaler.transform(x_data) 83 | assert np.all(transformed_x == reloaded_scale_x) 84 | finally: 85 | if exists("group_test.json"): 86 | os.remove("group_test.json") 87 | return -------------------------------------------------------------------------------- /bridgescaler/deep.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class DeepStandardScaler(object): 5 | """ 6 | Calculate standard scaler scores on an arbitrarily dimensional dataset as long as the last dimension is 7 | the variable dimension. 8 | 9 | """ 10 | def __init__(self): 11 | self.mean_ = None 12 | self.sd_ = None 13 | return 14 | 15 | def fit(self, x): 16 | self.mean_ = np.zeros(x.shape[-1], dtype=x.dtype) 17 | self.sd_ = np.zeros(x.shape[-1], dtype=x.dtype) 18 | for v in range(x.shape[-1]): 19 | self.mean_[v] = np.mean(x[..., v]) 20 | self.sd_[v] = np.std(x[..., v], ddof=1) 21 | 22 | def transform(self, x): 23 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 24 | for v in range(x.shape[-1]): 25 | x_transformed[..., v] = (x[..., v] - self.mean_[v]) / self.sd_[v] 26 | return x_transformed 27 | 28 | def fit_transform(self, x): 29 | self.fit(x) 30 | return self.transform(x) 31 | 32 | def inverse_transform(self, x): 33 | x_inverse = np.zeros(x.shape, dtype=x.dtype) 34 | for v in range(x.shape[-1]): 35 | x_inverse[..., v] = x[..., v] * self.sd_[v] + self.mean_[v] 36 | return x_inverse 37 | 38 | 39 | class DeepMinMaxScaler(object): 40 | def __init__(self): 41 | self.max_ = None 42 | self.min_ = None 43 | return 44 | 45 | def fit(self, x): 46 | self.max_ = np.zeros(x.shape[-1], dtype=x.dtype) 47 | self.min_ = np.zeros(x.shape[-1], dtype=x.dtype) 48 | for v in range(x.shape[-1]): 49 | self.max_[v] = np.max(x[..., v]) 50 | self.min_[v] = np.min(x[..., v]) 51 | 52 | def transform(self, x): 53 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 54 | for v in range(x.shape[-1]): 55 | x_transformed[..., v] = (x[..., v] - self.min_[v]) / (self.max_[v] - self.min_[v]) 56 | return x_transformed 57 | 58 | def fit_transform(self, x): 59 | self.fit(x) 60 | return self.transform(x) 61 | 62 | def inverse_transform(self, x): 63 | x_inverse = np.zeros(x.shape, dtype=x.dtype) 64 | for v in range(x.shape[-1]): 65 | x_inverse[..., v] = x[..., v] * (self.max_[v] - self.min_[v]) + self.min_[v] 66 | return x_inverse 67 | 68 | 69 | class DeepQuantileTransformer(object): 70 | """ 71 | Performs a quantile transform on N-dimensional arrays where the variable dimension is the last one. 72 | 73 | Attributes: 74 | n_quantiles: number of quantiles to calculate and store 75 | stochastic: When transforming to quantile space, whether to take the mean of the left and right interpolation values (False) 76 | or to pick a random point in between (True). 
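
    Example:
        A minimal sketch (``x`` here stands for any array whose last dimension indexes the variables)::

            dqt = DeepQuantileTransformer(n_quantiles=100, stochastic=False)
            x_q = dqt.fit_transform(x)        # values mapped into [0, 1]
            x_back = dqt.inverse_transform(x_q)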
77 | """ 78 | def __init__(self, n_quantiles=1000, stochastic=False): 79 | self.n_quantiles = n_quantiles 80 | self.stochastic = stochastic 81 | self.quantiles_ = None 82 | self.references_ = None 83 | self.fitted_ = False 84 | self.x_column_names_ = None 85 | 86 | def fit(self, x): 87 | if hasattr(x, "columns"): 88 | self.x_columns_ = x.columns 89 | else: 90 | self.x_columns_ = np.arange(x.shape[-1]) 91 | self.quantiles_ = np.zeros((x.shape[-1], self.n_quantiles), dtype=x.dtype) 92 | self.references_ = np.linspace(0, 1, self.n_quantiles, endpoint=True) 93 | for v in range(x.shape[-1]): 94 | self.quantiles_[v] = np.nanquantile(x[..., v].ravel(), self.references_) 95 | self.quantiles_[v] = np.maximum.accumulate(self.quantiles_[v]) 96 | return 97 | 98 | def transform(self, x): 99 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 100 | for v in range(x.shape[-1]): 101 | x_transformed[..., v] = self._transform_col(x[..., v].ravel(), v).reshape(x[..., v].shape) 102 | return x_transformed 103 | 104 | def fit_transform(self, x): 105 | self.fit(x) 106 | return self.transform(x) 107 | 108 | def inverse_transform(self, x): 109 | x_transformed = np.zeros(x.shape, dtype=x.dtype) 110 | for v in range(x.shape[-1]): 111 | x_transformed[..., v] = self._inverse_transform_col(x[..., v].ravel(), v).reshape(x[..., v].shape) 112 | return x_transformed 113 | 114 | def _transform_col(self, x_col, col_index): 115 | left_ref = np.interp(x_col, self.quantiles_[col_index], self.references_) 116 | right_ref = -np.interp(-x_col, -self.quantiles_[col_index][::-1], -self.references_[::-1]) 117 | p = 0.5 118 | if self.stochastic: 119 | p = np.random.uniform(0, 1, x_col.size) 120 | return p * left_ref + (1 - p) * right_ref 121 | 122 | def _inverse_transform_col(self, x_col, col_index): 123 | transformed_col = np.interp(x_col, self.references_, self.quantiles_[col_index]) 124 | return transformed_col 125 | -------------------------------------------------------------------------------- /bridgescaler/tests/distributed_tensor_test.py: -------------------------------------------------------------------------------- 1 | # from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 2 | from bridgescaler.distributed_tensor import DStandardScalerTensor, DMinMaxScalerTensor 3 | import numpy as np 4 | import torch 5 | import os 6 | 7 | def make_test_data(): 8 | np.random.seed(34325) 9 | test_data = dict() 10 | col_names = ["a", "b", "c", "d", "e"] 11 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 12 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 13 | test_data["n_examples"] = np.array([1000, 500, 88]) 14 | test_data["numpy_2d"] = [] 15 | test_data["numpy_4d"] = [] 16 | test_data["pandas"] = [] 17 | test_data["xarray"] = [] 18 | tile_width = 5 19 | for n in range(test_data["n_examples"].size): 20 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 21 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 22 | for i in range(test_data["means"].size): 23 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 24 | scale=test_data["sds"][i], 25 | size=test_data["n_examples"][n]) 26 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 27 | scale=test_data["sds"][i], 28 | size=(test_data["n_examples"][n], tile_width, tile_width)) 29 | test_data["numpy_2d"].append(data2d) 30 | test_data["numpy_4d"].append(data4d) 31 | 32 | return test_data 33 | 34 | 35 | # Create test datasets for use in all unit tests. 
36 | test_data = make_test_data() 37 | 38 | def test_dstandard_tensor_scaler(): 39 | numpy_2d_1 = torch.from_numpy(test_data["numpy_2d"][0]) 40 | numpy_2d_2 = torch.from_numpy(test_data["numpy_2d"][1]) 41 | numpy_2d_3 = torch.from_numpy(test_data["numpy_2d"][2]) 42 | all_ds_2d = torch.vstack([numpy_2d_1, numpy_2d_2, numpy_2d_3]) 43 | numpy_4d_1 = torch.from_numpy(test_data["numpy_4d"][0]) 44 | numpy_4d_2 = torch.from_numpy(test_data["numpy_4d"][1]) 45 | numpy_4d_3 = torch.from_numpy(test_data["numpy_4d"][2]) 46 | all_ds_4d = torch.vstack([numpy_4d_1, numpy_4d_2, numpy_4d_3]) 47 | dsses_2d = [] 48 | dsses_4d = [] 49 | for n in range(test_data["n_examples"].size): 50 | dsses_2d.append(DStandardScalerTensor()) 51 | dsses_2d[-1].fit(torch.from_numpy(test_data["numpy_2d"][n])) 52 | dsses_4d.append(DStandardScalerTensor(channels_last=True)) 53 | dsses_4d[-1].fit(torch.from_numpy(test_data["numpy_4d"][n])) 54 | # save_scaler(dsses_2d[-1], "scaler.json") 55 | # new_scaler = load_scaler("scaler.json") 56 | # os.remove("scaler.json") 57 | dss_total_2d = dsses_2d[0] + dsses_2d[1] + dsses_2d[2] 58 | dss_total_4d = dsses_4d[0] + dsses_4d[1] + dsses_4d[2] 59 | mean_2d, var_2d = dss_total_2d.get_scales() 60 | mean_4d, var_4d = dss_total_4d.get_scales() 61 | all_2d_var = all_ds_2d.var(axis=0, unbiased=False) 62 | all_4d_var = torch.tensor([all_ds_4d[..., i].var(unbiased=False) for i in range(all_ds_4d.shape[-1])]) 63 | all_4d_mean = torch.tensor([all_ds_4d[..., i].mean() for i in range(all_ds_4d.shape[-1])]) 64 | assert mean_2d.shape[0] == test_data["means"].shape[0] and var_2d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 65 | assert mean_4d.shape[0] == test_data["means"].shape[0] and var_4d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 66 | assert torch.max(torch.abs(mean_2d - all_ds_2d.mean(axis=0))) < 1e-5, "significant difference in means" 67 | assert torch.max(torch.abs(var_2d - all_2d_var) / all_2d_var) < 1e-5, "significant difference in variances" 68 | assert torch.max(torch.abs(mean_4d - all_4d_mean) / all_4d_mean) < 1e-5, "significant difference in means" 69 | assert torch.max(torch.abs(var_4d - all_4d_var) / all_4d_var) < 1e-5, "significant difference in variances" 70 | 71 | 72 | def test_dminmax_tensor_scaler(): 73 | numpy_2d_1 = torch.from_numpy(test_data["numpy_2d"][0]) 74 | numpy_2d_2 = torch.from_numpy(test_data["numpy_2d"][1]) 75 | numpy_2d_3 = torch.from_numpy(test_data["numpy_2d"][2]) 76 | all_ds_2d = torch.vstack([numpy_2d_1, numpy_2d_2, numpy_2d_3]) 77 | numpy_4d_1 = torch.from_numpy(test_data["numpy_4d"][0]) 78 | numpy_4d_2 = torch.from_numpy(test_data["numpy_4d"][1]) 79 | numpy_4d_3 = torch.from_numpy(test_data["numpy_4d"][2]) 80 | all_ds_4d = torch.vstack([numpy_4d_1, numpy_4d_2, numpy_4d_3]) 81 | dsses_2d = [] 82 | dsses_4d = [] 83 | for n in range(test_data["n_examples"].size): 84 | dsses_2d.append(DMinMaxScalerTensor()) 85 | dsses_2d[-1].fit(torch.from_numpy(test_data["numpy_2d"][n])) 86 | dsses_4d.append(DMinMaxScalerTensor()) 87 | dsses_4d[-1].fit(torch.from_numpy(test_data["numpy_4d"][n])) 88 | #save_scaler(dsses_2d[-1], "scaler.json") 89 | #new_scaler = load_scaler("scaler.json") 90 | #os.remove("scaler.json") 91 | dss_total_2d = dsses_2d[0] + dsses_2d[1] + dsses_2d[2] 92 | dss_total_4d = dsses_4d[0] + dsses_4d[1] + dsses_4d[2] 93 | min_2d, max_2d = dss_total_2d.get_scales() 94 | min_4d, max_4d = dss_total_4d.get_scales() 95 | assert torch.max(torch.abs(min_2d - all_ds_2d.min(axis=0).values)) < 1e-8, "significant difference in minimum" 96 | 
assert torch.max(torch.abs(max_2d - all_ds_2d.max(axis=0).values)) < 1e-8, "significant difference in maximum" 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bridgescaler 2 | Bridge your scikit-learn-style scaler parameters between Python sessions and users. 3 | Bridgescaler allows you to save the properties of a scikit-learn-style scaler object 4 | to a json file, and then repopulate a new scaler object with the same properties. 5 | 6 | 7 | ## Dependencies 8 | * scikit-learn 9 | * numpy 10 | * pandas 11 | * xarray 12 | * pytdigest 13 | 14 | ## Installation 15 | For a stable version of bridgescaler, you can install from PyPI. 16 | ```bash 17 | pip install bridgescaler 18 | ``` 19 | 20 | For the latest version of bridgescaler, install from github. 21 | ```bash 22 | git clone https://github.com/NCAR/bridgescaler.git 23 | cd bridgescaler 24 | pip install . 25 | ``` 26 | 27 | ## Usage 28 | bridgescaler supports all the common scikit-learn scaler classes: 29 | * StandardScaler 30 | * RobustScaler 31 | * MinMaxScaler 32 | * MaxAbsScaler 33 | * QuantileTransformer 34 | * PowerTransformer 35 | * SplineTransformer 36 | 37 | First, create some synthetic data to transform. 38 | ```python 39 | import numpy as np 40 | import pandas as pd 41 | 42 | # specify distribution parameters for each variable 43 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 44 | scales = np.array([1.0, 10, 0.1, 5000.0]) 45 | names = ["A", "B", "C", "D"] 46 | num_examples = 205 47 | x_data_dict = {} 48 | for l in range(locs.shape[0]): 49 | # sample from random normal with different parameters 50 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 51 | x_data = pd.DataFrame(x_data_dict) 52 | ``` 53 | 54 | Now, let's fit and transform the data with StandardScaler. 55 | ```python 56 | from sklearn.preprocessing import StandardScaler 57 | from bridgescaler import save_scaler, load_scaler 58 | 59 | scaler = StandardScaler() 60 | scaler.fit_transform(x_data) 61 | filename = "x_standard_scaler.json" 62 | # save to json file 63 | save_scaler(scaler, filename) 64 | 65 | # create new StandardScaler from json file information. 66 | new_scaler = load_scaler(filename) # new_scaler is a StandardScaler object 67 | ``` 68 | ### Distributed Scaler 69 | The distributed scalers allow you to calculate scaling 70 | parameters on different subsets of a dataset and then combine the scaling factors 71 | together to get representative scaling values for the full dataset. Distributed 72 | Standard Scalers, MinMax Scalers, and Quantile Transformers have been implemented and work with both tabular 73 | and muliti-dimensional patch data in numpy, pandas DataFrame, and xarray DataArray formats. 74 | By default, the scaler assumes your channel/variable dimension is the last 75 | dimension, but if `channels_last=False` is set in the `__init__`, `transform`, 76 | or `inverse_transform` methods, then the 2nd dimension is assumed to be the variable 77 | dimension. It is possible to fit data with one ordering and then 78 | transform it with a different one. 79 | 80 | For large datasets, it may be expensive to redo the scalers if you want to use a 81 | subset or different ordering of variables. 
However, in bridgescaler, the 82 | Distributed Scalers all support arbitrary ordering and subsets of variables for transforms if 83 | the input data are in a Xarray DataArray or Pandas DataFrame with variable 84 | names that match the original data. 85 | 86 | Example: 87 | ```python 88 | from bridgescaler.distributed import DStandardScaler 89 | import numpy as np 90 | 91 | x_1 = np.random.normal(0, 2.2, (20, 5, 4, 8)) 92 | x_2 = np.random.normal(1, 3.5, (25, 4, 8, 5)) 93 | 94 | dss_1 = DStandardScaler(channels_last=False) 95 | dss_2 = DStandardScaler(channels_last=True) 96 | dss_1.fit(x_1) 97 | dss_2.fit(x_2) 98 | dss_combined = np.sum([dss_1, dss_2]) 99 | 100 | dss_combined.transform(x_1, channels_last=False) 101 | ``` 102 | 103 | ### Group Scaler 104 | The group scalers use the same scaling parameters for a group of similar 105 | variables rather than scaling each column independently. This is useful for situations where variables are related, 106 | such as temperatures at different height levels. 107 | 108 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 109 | for numpy arrays. 110 | 111 | For example: 112 | ```python 113 | from bridgescaler.group import GroupStandardScaler 114 | import pandas as pd 115 | import numpy as np 116 | x_rand = np.random.random(size=(100, 5)) 117 | data = pd.DataFrame(data=x_rand, 118 | columns=["a", "b", "c", "d", "e"]) 119 | groups = [["a", "b"], ["c", "d"], "e"] 120 | group_scaler = GroupStandardScaler() 121 | x_transformed = group_scaler.fit_transform(data, groups=groups) 122 | ``` 123 | 124 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 125 | deviation for that group. 126 | 127 | ### Deep Scaler 128 | The deep scalers are designed to scale 2 or 3-dimensional fields input into a 129 | deep learning model such as a convolutional neural network. The scalers assume 130 | that the last dimension is the channel/variable dimension and scales the values accordingly. 131 | The scalers can support 2D or 3D patches with no change in code structure. Support is provided for 132 | DeepStandardScaler and DeepQuantileTransformer. 
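A `DeepMinMaxScaler` with the same fit/transform interface is also exported from `bridgescaler.deep`; a minimal sketch with an illustrative array shape:
```python
from bridgescaler.deep import DeepMinMaxScaler
import numpy as np

x = np.random.random(size=(100, 16, 16, 2)).astype(np.float32)
dmm = DeepMinMaxScaler()
x_scaled = dmm.fit_transform(x)  # each channel rescaled to [0, 1]
x_restored = dmm.inverse_transform(x_scaled)
```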
133 | 134 | Example: 135 | ```python 136 | from bridgescaler.deep import DeepStandardScaler 137 | import numpy as np 138 | np.random.seed(352680) 139 | n_ex = 5000 140 | n_channels = 4 141 | dim = 32 142 | means = np.array([1, 5, -4, 2.5], dtype=np.float32) 143 | sds = np.array([10, 2, 43.4, 32.], dtype=np.float32) 144 | x = np.zeros((n_ex, dim, dim, n_channels), dtype=np.float32) 145 | for chan in range(n_channels): 146 | x[..., chan] = np.random.normal(means[chan], sds[chan], (n_ex, dim, dim)) 147 | dss = DeepStandardScaler() 148 | dss.fit(x) 149 | x_transformed = dss.transform(x) 150 | ``` 151 | -------------------------------------------------------------------------------- /bridgescaler/backend.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import (StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, 2 | SplineTransformer, PowerTransformer) 3 | from bridgescaler.group import GroupStandardScaler, GroupRobustScaler, GroupMinMaxScaler 4 | from bridgescaler.deep import DeepStandardScaler, DeepMinMaxScaler, DeepQuantileTransformer 5 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 6 | import numpy as np 7 | import json 8 | import pandas as pd 9 | from numpy.lib.format import descr_to_dtype, dtype_to_descr 10 | from base64 import b64decode, b64encode 11 | from typing import Any 12 | 13 | 14 | scaler_objs = {"StandardScaler": StandardScaler, 15 | "MinMaxScaler": MinMaxScaler, 16 | "RobustScaler": RobustScaler, 17 | "MaxAbsScaler": MaxAbsScaler, 18 | "SplineTransformer": SplineTransformer, 19 | "PowerTransformer": PowerTransformer, 20 | "QuantileTransformer": QuantileTransformer, 21 | "GroupStandardScaler": GroupStandardScaler, 22 | "GroupRobustScaler": GroupRobustScaler, 23 | "GroupMinMaxScaler": GroupMinMaxScaler, 24 | "DeepStandardScaler": DeepStandardScaler, 25 | "DeepMinMaxScaler": DeepMinMaxScaler, 26 | "DeepQuantileTransformer": DeepQuantileTransformer, 27 | "DStandardScaler": DStandardScaler, 28 | "DMinMaxScaler": DMinMaxScaler, 29 | "DQuantileScaler": DQuantileScaler, 30 | } 31 | 32 | 33 | def save_scaler(scaler, scaler_file): 34 | """ 35 | Save a scikit-learn or bridgescaler scaler object to json format. 36 | 37 | Args: 38 | scaler: scikit-learn-style scaler object 39 | scaler_file: path to json file where scaler information is stored. 40 | """ 41 | scaler_params = scaler.__dict__ 42 | scaler_params["type"] = str(type(scaler))[1:-2].split(".")[-1] 43 | with open(scaler_file, "w") as file_obj: 44 | json.dump(scaler_params, file_obj, indent=4, sort_keys=True, cls=NumpyEncoder) 45 | return 46 | 47 | 48 | def print_scaler(scaler): 49 | """ 50 | Output scikit-learn or bridgescaler scaler object to json string. 51 | 52 | Args: 53 | scaler: scikit-learn-style scaler object 54 | 55 | Returns: 56 | str representation of object in json format 57 | """ 58 | scaler_params = scaler.__dict__ 59 | scaler_params["type"] = str(type(scaler))[1:-2].split(".")[-1] 60 | return json.dumps(scaler_params, indent=4, sort_keys=True, cls=NumpyEncoder) 61 | 62 | 63 | def object_hook(dct: dict[Any, Any]): 64 | if "__numpy__" in dct: 65 | np_obj = np.frombuffer( 66 | b64decode(dct["__numpy__"]), descr_to_dtype(dct["dtype"]) 67 | ) 68 | return np_obj.reshape(shape) if (shape := dct["shape"]) else np_obj[0] 69 | return dct 70 | 71 | def read_scaler(scaler_str): 72 | """ 73 | Initialize scikit-learn or bridgescaler scaler from json str. 
74 | 75 | Args: 76 | scaler_str: json str 77 | 78 | Returns: 79 | scaler object. 80 | """ 81 | scaler_params = json.loads(scaler_str, object_hook=object_hook) 82 | scaler = scaler_objs[scaler_params["type"]]() 83 | del scaler_params["type"] 84 | for k, v in scaler_params.items(): 85 | if isinstance(v, dict) and v["object"] == "ndarray": 86 | setattr(scaler, k, np.array(v['data'], dtype=v['dtype']).reshape(v['shape'])) 87 | else: 88 | setattr(scaler, k, v) 89 | return scaler 90 | 91 | 92 | def load_scaler(scaler_file): 93 | """ 94 | Initialize scikit-learn or bridgescaler scaler from saved json file. 95 | 96 | Args: 97 | scaler_file: path to json file. 98 | 99 | Returns: 100 | scaler object. 101 | """ 102 | with open(scaler_file, "r") as file_obj: 103 | scaler_str = file_obj.read() 104 | return read_scaler(scaler_str) 105 | 106 | 107 | class NumpyEncoder(json.JSONEncoder): 108 | """ Custom encoder for numpy data types """ 109 | 110 | def default(self, obj): 111 | if int(np.__version__.split('.')[0]) >= 2: 112 | float_types = (np.float16, np.float32, np.float64) 113 | else: 114 | float_types = (np.float_, np.float16, np.float32, np.float64) 115 | 116 | if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, 117 | np.int16, np.int32, np.int64, np.uint8, 118 | np.uint16, np.uint32, np.uint64)): 119 | 120 | return int(obj) 121 | 122 | elif isinstance(obj, float_types): 123 | return float(obj) 124 | 125 | elif isinstance(obj, (np.complex64, np.complex128)): 126 | return {'real': obj.real, 'imag': obj.imag} 127 | 128 | elif isinstance(obj, (np.ndarray,)) and obj.dtype == "|O": 129 | return {'object': 'ndarray', 'dtype': obj.dtype.str, 'shape': list(obj.shape), 130 | 'data': obj.ravel().tolist()} 131 | 132 | elif isinstance(obj, (np.ndarray, np.generic)): 133 | return { 134 | "__numpy__": b64encode( 135 | obj.data if obj.flags.c_contiguous else obj.tobytes() 136 | ).decode(), 137 | "dtype": dtype_to_descr(obj.dtype), 138 | "shape": obj.shape, 139 | } 140 | 141 | elif isinstance(obj, (np.bool_)): 142 | return bool(obj) 143 | 144 | elif isinstance(obj, (np.void)): 145 | return None 146 | 147 | return json.JSONEncoder.default(self, obj) 148 | 149 | 150 | def create_synthetic_data(): 151 | locs = np.array([0, 5, -2, 350.5], dtype=np.float32) 152 | scales = np.array([1.0, 10, 0.1, 5000.0]) 153 | names = ["A", "B", "C", "D"] 154 | num_examples = 205 155 | x_data_dict = {} 156 | for l in range(locs.shape[0]): 157 | x_data_dict[names[l]] = np.random.normal(loc=locs[l], scale=scales[l], size=num_examples) 158 | x_data = pd.DataFrame(x_data_dict) 159 | return x_data 160 | -------------------------------------------------------------------------------- /scripts/eval_scaler.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileTransformer, DQuantileScaler 2 | from bridgescaler import save_scaler, load_scaler 3 | import numpy as np 4 | import pandas as pd 5 | import xarray as xr 6 | import os 7 | from multiprocessing import Pool 8 | import psutil 9 | from scipy.special import ndtri 10 | from scipy.stats import norm 11 | from memory_profiler import profile 12 | 13 | def make_test_data(): 14 | np.random.seed(34325) 15 | test_data = dict() 16 | col_names = ["a", "b", "c", "d", "e"] 17 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 18 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 19 | test_data["n_examples"] = np.array([100000, 500, 88]) 20 | test_data["numpy_2d"] = 
[] 21 | test_data["numpy_4d"] = [] 22 | test_data["pandas"] = [] 23 | test_data["xarray"] = [] 24 | tile_width = 5 25 | for n in range(test_data["n_examples"].size): 26 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 27 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 28 | for i in range(test_data["means"].size): 29 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 30 | scale=test_data["sds"][i], 31 | size=test_data["n_examples"][n]) 32 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 33 | scale=test_data["sds"][i], 34 | size=(test_data["n_examples"][n], tile_width, tile_width)) 35 | test_data["numpy_2d"].append(data2d) 36 | test_data["numpy_4d"].append(data4d) 37 | test_data["pandas"].append(pd.DataFrame(data2d, columns=col_names, index=np.arange(data2d.shape[0]))) 38 | test_data["xarray"].append(xr.DataArray(data4d, 39 | dims=("batch", "y", "x", "variable"), 40 | coords=dict(batch=np.arange(test_data["n_examples"][n]), 41 | y=np.arange(tile_width), 42 | x=np.arange(tile_width), 43 | variable=col_names))) 44 | 45 | return test_data 46 | 47 | def eval_dquantile_scaler(test_data): 48 | np.random.seed(536) 49 | dsses_2d = [] 50 | dsses_4d = [] 51 | #pool = None 52 | pool = Pool(8) 53 | for n in range(test_data["n_examples"].size): 54 | dsses_2d.append(DQuantileScaler()) 55 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 56 | dsses_4d.append(DQuantileScaler()) 57 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 58 | ds_2d_transformed = dsses_2d[-1].transform(test_data["numpy_2d"][n], pool=pool) 59 | ds_4d_transformed = dsses_4d[-1].transform(test_data["numpy_4d"][n], pool=pool) 60 | ds_2d_it = dsses_2d[-1].inverse_transform(ds_2d_transformed, pool=pool) 61 | ds_4d_it = dsses_4d[-1].inverse_transform(ds_4d_transformed, pool=pool) 62 | assert ds_2d_transformed.max() <= 1, "Quantile transform > 1" 63 | assert ds_4d_transformed.max() <= 1, "Quantile transform > 1" 64 | save_scaler(dsses_2d[-1], "scaler.json") 65 | new_scaler = load_scaler("scaler.json") 66 | os.remove("scaler.json") 67 | assert np.nanargmax(np.abs((new_scaler.min_ - dsses_2d[-1].min_))) == 0, \ 68 | "Differences in scaler centroid values after loading" 69 | pd_dss = DQuantileScaler() 70 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0], pool=pool) 71 | pd_inv_trans = pd_dss.inverse_transform(pd_trans, pool=pool) 72 | sub_cols = ["d", "b"] 73 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols], pool=pool) 74 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 75 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans, pool=pool) 76 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 
77 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 78 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 79 | xr_dss = DQuantileScaler(distribution="normal") 80 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0], pool=pool) 81 | xr_inv_trans = xr_dss.inverse_transform(xr_trans, pool=pool) 82 | assert np.all(~np.isnan(xr_trans)), "nans in transform" 83 | assert np.all(~np.isnan(xr_inv_trans)), "nans in inverse transform" 84 | assert xr_trans.shape == test_data["xarray"][0].shape, "shape does not match" 85 | assert xr_inv_trans.shape == test_data["xarray"][0].shape, "shape does not match" 86 | 87 | #assert np.max(np.abs(xr_inv_trans.values - test_data["xarray"][0].values)) < 1e-3, "Differences in transform" 88 | combined_scaler = np.sum(dsses_2d) 89 | assert combined_scaler.size_[0] == test_data["n_examples"].sum(), \ 90 | "Summing did not work properly." 91 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 92 | xr_dss_first = xr_dss.transform(test_data_c_first, channels_last=False, pool=pool) 93 | xr_inv_dss_first = xr_dss.inverse_transform(xr_dss_first, channels_last=False, pool=pool) 94 | assert xr_dss_first.shape == xr_inv_dss_first.shape, "shape does not match" 95 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 96 | xr_dss_f.fit(test_data_c_first) 97 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, pool=pool) 98 | assert scaled_data_quantile_first.shape == test_data_c_first.shape 99 | if pool is not None: 100 | pool.close() 101 | pool.join() 102 | return 103 | 104 | def small_eval(test_data): 105 | process = psutil.Process() 106 | 107 | # Record initial memory usage 108 | 109 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 110 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 111 | xr_dss_f.fit(test_data_c_first) 112 | bt_memory = process.memory_info().rss 113 | initial_memory = process.memory_info().rss 114 | print(initial_memory/1e6) 115 | xr_dss_f.distribution = None 116 | test_data_c_first = xr_dss_f.transform(test_data_c_first) 117 | test_data_c_sec = ndtri(test_data_c_first) 118 | output_arr = np.full((1000, 50, 50), 0.5) 119 | output_arr = norm.ppf(output_arr) 120 | output_arr = np.full((1000, 50, 50), 0.5) 121 | output_arr = ndtri(output_arr) 122 | at_memory = process.memory_info().rss 123 | print("final mem:", at_memory / 1e6) 124 | 125 | print("mem diff:", (at_memory - bt_memory) / 1e6) 126 | return test_data_c_first 127 | 128 | 129 | if __name__ == "__main__": 130 | from time import time 131 | 132 | start = time() 133 | test_data = make_test_data() 134 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 135 | print(test_data["xarray"][0]) 136 | test_data_c_first[:] = small_eval(test_data) 137 | #eval_dquantile_scaler(test_data) 138 | stop = time() 139 | print(stop - start) 140 | -------------------------------------------------------------------------------- /bridgescaler/group.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | 4 | 5 | class GroupBaseScaler(object): 6 | def __init__(self): 7 | self.groups_ = None 8 | self.group_index_ = None 9 | self.x_columns_ = None 10 | 11 | def extract_x_columns(self, x): 12 | """ 13 | 
Extract the variable names to be transformed from x depending on whether x is a pandas DataFrame, an 14 | xarray DataArray, or a numpy array. All of these assume that the columns are in the last dimension. 15 | If x is an xarray DataArray, there should be a coordinate variable with the same name as the last dimension 16 | of the DataArray being transformed. 17 | 18 | Args: 19 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): array of values to be transformed. 20 | 21 | Returns: 22 | xv (numpy.ndarray): Array of values to be transformed. 23 | """ 24 | if hasattr(x, "columns"): 25 | self.x_columns_ = x.columns 26 | xv = x.values 27 | elif hasattr(x, "coords"): 28 | var_dim = x.dims[-1] 29 | self.x_columns_ = x.coords[var_dim].values 30 | xv = x.values 31 | else: 32 | self.x_columns_ = np.arange(x.shape[-1]) 33 | xv = x 34 | return xv 35 | 36 | @staticmethod 37 | def package_transformed_x(x_transformed, x): 38 | """ 39 | Repackage a transformed numpy array into the same datatype as the original x, including 40 | all metadata. 41 | 42 | Args: 43 | x_transformed (numpy.ndarray): array after being transformed or inverse transformed 44 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): original input data whose type and metadata are copied. 45 | 46 | Returns: 47 | x_packaged (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): x_transformed repackaged in the same type as x. 48 | """ 49 | if hasattr(x, "columns"): 50 | x_packaged = copy(x) 51 | x_packaged.loc[:, :] = x_transformed 52 | elif hasattr(x, "coords"): 53 | x_packaged = copy(x) 54 | x_packaged[:] = x_transformed 55 | else: 56 | x_packaged = x_transformed 57 | return x_packaged 58 | 59 | def fit(self, x, groups=None): 60 | self._fit(x, groups) 61 | return self 62 | 63 | def fit_transform(self, x, groups=None): 64 | self._fit(x, groups) 65 | return self.transform(x) 66 | 67 | def transform(self, x): 68 | transformed_x = deepcopy(x) 69 | is_df = hasattr(x, "columns") 70 | for column in self.x_columns_: 71 | group_index = self.find_group(column) 72 | if is_df: 73 | transformed_x.loc[:, column] = self._transform_column(x[column], group_index) 74 | else: 75 | transformed_x[:, column] = self._transform_column(x[:, column], group_index) 76 | return transformed_x 77 | 78 | def inverse_transform(self, x): 79 | transformed_x = deepcopy(x) 80 | is_df = hasattr(x, "columns") 81 | for column in self.x_columns_: 82 | group_index = self.find_group(column) 83 | if is_df: 84 | transformed_x.loc[:, column] = self._inverse_transform_column(x[column], group_index) 85 | else: 86 | transformed_x[:, column] = self._inverse_transform_column(x[:, column], group_index) 87 | return transformed_x 88 | 89 | def set_groups(self, x, groups): 90 | if groups is None: 91 | if hasattr(x, "columns"): 92 | self.groups_ = list(x.columns) 93 | self.x_columns_ = list(x.columns) 94 | else: 95 | self.groups_ = list(range(x.shape[1])) 96 | self.x_columns_ = list(range(x.shape[1])) 97 | else: 98 | self.groups_ = groups 99 | if hasattr(x, "columns"): 100 | self.x_columns_ = list(x.columns) 101 | else: 102 | self.x_columns_ = list(range(x.shape[1])) 103 | self.group_index_ = np.arange(len(self.groups_)) 104 | 105 | def find_group(self, var_name): 106 | group_index = -1 107 | for g, group in enumerate(self.groups_): 108 | if type(group) is not list and var_name == group: 109 | group_index = g 110 | elif type(group) is list and var_name in group: 111 | group_index = g 112 | assert group_index >= 0, var_name + " not found in groups."
113 | return group_index 114 | 115 | def _fit(self, x, groups): 116 | raise NotImplementedError 117 | 118 | def _transform_column(self, x, group_index): 119 | raise NotImplementedError 120 | 121 | def _inverse_transform_column(self, x, group_index): 122 | raise NotImplementedError 123 | 124 | 125 | class GroupStandardScaler(GroupBaseScaler): 126 | """ 127 | Scaler that enables calculation and sharing of scaling parameters among multiple variables via variable groupings. 128 | This is useful for situations where variables are related, such as temperatures at different height levels. 129 | 130 | Groups are specified as a list of column ids, which can be column names for pandas dataframes or column indices 131 | for numpy arrays. 132 | 133 | For example: 134 | ``` 135 | groups = [["a", "b"], ["c", "d"], "e"] 136 | ``` 137 | "a" and "b" are a single group and all values of both will be included when calculating the mean and standard 138 | deviation for that group. 139 | """ 140 | def __init__(self): 141 | self.center_ = None 142 | self.scale_ = None 143 | super().__init__() 144 | 145 | def _fit(self, x, groups=None): 146 | self.set_groups(x, groups) 147 | self.center_ = np.zeros(self.group_index_.shape) 148 | self.scale_ = np.zeros(self.group_index_.shape) 149 | is_df = hasattr(x, "columns") 150 | for g in self.group_index_: 151 | if is_df: 152 | self.center_[g] = np.mean(x[self.groups_[g]].values) 153 | self.scale_[g] = np.std(x[self.groups_[g]].values) 154 | else: 155 | self.center_[g] = np.mean(x[:, self.groups_[g]]) 156 | self.scale_[g] = np.std(x[:, self.groups_[g]]) 157 | 158 | return 159 | 160 | def _transform_column(self, x_column, group_index): 161 | return (x_column - self.center_[group_index]) / self.scale_[group_index] 162 | 163 | def _inverse_transform_column(self, x_column, group_index): 164 | return x_column * self.scale_[group_index] + self.center_[group_index] 165 | 166 | 167 | class GroupMinMaxScaler(GroupBaseScaler): 168 | """ 169 | Group version of MinMaxScaler 170 | """ 171 | def __init__(self, feature_range=(0, 1)): 172 | self.feature_range = feature_range 173 | self.mins_ = None 174 | self.maxes_ = None 175 | GroupBaseScaler.__init__(self) 176 | return 177 | 178 | def _fit(self, x, groups): 179 | self.set_groups(x, groups) 180 | self.mins_ = np.zeros(self.group_index_.shape) 181 | self.maxes_ = np.zeros(self.group_index_.shape) 182 | is_df = hasattr(x, "columns") 183 | for g in self.group_index_: 184 | if is_df: 185 | self.mins_[g] = np.min(x[self.groups_[g]].values) 186 | self.maxes_[g] = np.max(x[self.groups_[g]].values) 187 | else: 188 | self.mins_[g] = np.min(x[:, self.groups_[g]]) 189 | self.maxes_[g] = np.max(x[:, self.groups_[g]]) 190 | return 191 | 192 | def _transform_column(self, x_column, group_index): 193 | x_normed = (x_column - self.mins_[group_index]) / (self.maxes_[group_index] - self.mins_[group_index]) 194 | return x_normed * (self.feature_range[1] - self.feature_range[0]) + self.feature_range[0] 195 | 196 | def _inverse_transform_column(self, x_column, group_index): 197 | x_normed = (x_column - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0]) 198 | return x_normed * (self.maxes_[group_index] - self.mins_[group_index]) + self.mins_[group_index] 199 | 200 | 201 | class GroupRobustScaler(GroupBaseScaler): 202 | """ 203 | Group version of RobustScaler 204 | 205 | """ 206 | def __init__(self, quartile_range=(25.0, 75.0)): 207 | self.quartile_range = quartile_range 208 | self.center_ = None 209 | self.scale_ = None 210 | 
super().__init__() 211 | 212 | def _fit(self, x, groups): 213 | self.set_groups(x, groups) 214 | self.center_ = np.zeros(self.group_index_.shape) 215 | self.scale_ = np.zeros(self.group_index_.shape) 216 | is_df = hasattr(x, "columns") 217 | for g in self.group_index_: 218 | if is_df: 219 | self.center_[g] = np.median(x[self.groups_[g]]) 220 | self.scale_[g] = np.abs(np.quantile(x[self.groups_[g]], self.quartile_range[1] / 100.0) - 221 | np.quantile(x[self.groups_[g]], self.quartile_range[0] / 100.0)) 222 | else: 223 | self.center_[g] = np.median(x[:, self.groups_[g]]) 224 | self.scale_[g] = np.abs(np.quantile(x[:, self.groups_[g]], self.quartile_range[1] / 100.0) - 225 | np.quantile(x[:, self.groups_[g]], self.quartile_range[0] / 100.0)) 226 | 227 | def _transform_column(self, x_column, group_index): 228 | return (x_column - self.center_[group_index]) / self.scale_[group_index] 229 | 230 | def _inverse_transform_column(self, x_column, group_index): 231 | return x_column * self.scale_[group_index] + self.center_[group_index] 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /bridgescaler/tests/distributed_test.py: -------------------------------------------------------------------------------- 1 | from bridgescaler.distributed import DStandardScaler, DMinMaxScaler, DQuantileScaler 2 | from bridgescaler import save_scaler, load_scaler, print_scaler, read_scaler 3 | import numpy as np 4 | import pandas as pd 5 | import xarray as xr 6 | import os 7 | from multiprocessing import Pool 8 | 9 | def make_test_data(): 10 | np.random.seed(34325) 11 | test_data = dict() 12 | col_names = ["a", "b", "c", "d", "e"] 13 | test_data["means"] = np.array([0, 5.3, -2.421, 21456.3, 1.e-5]) 14 | test_data["sds"] = np.array([5, 352.2, 1e-4, 20000.3, 5.3e-2]) 15 | test_data["n_examples"] = np.array([1000, 500, 88]) 16 | test_data["numpy_2d"] = [] 17 | test_data["numpy_4d"] = [] 18 | test_data["pandas"] = [] 19 | test_data["xarray"] = [] 20 | tile_width = 5 21 | for n in range(test_data["n_examples"].size): 22 | data2d = np.zeros((test_data["n_examples"][n], test_data["means"].size)) 23 | data4d = np.zeros((test_data["n_examples"][n], tile_width, tile_width, test_data["means"].size)) 24 | for i in range(test_data["means"].size): 25 | data2d[:, i] = np.random.normal(loc=test_data["means"][i], 26 | scale=test_data["sds"][i], 27 | size=test_data["n_examples"][n]) 28 | data4d[..., i] = np.random.normal(loc=test_data["means"][i], 29 | scale=test_data["sds"][i], 30 | size=(test_data["n_examples"][n], tile_width, tile_width)) 31 | test_data["numpy_2d"].append(data2d) 32 | test_data["numpy_4d"].append(data4d) 33 | test_data["pandas"].append(pd.DataFrame(data2d, columns=col_names, index=np.arange(data2d.shape[0]))) 34 | test_data["xarray"].append(xr.DataArray(data4d, 35 | dims=("batch", "y", "x", "variable"), 36 | coords=dict(batch=np.arange(test_data["n_examples"][n]), 37 | y=np.arange(tile_width), 38 | x=np.arange(tile_width), 39 | variable=col_names))) 40 | 41 | return test_data 42 | 43 | 44 | # Create test datasets for use in all unit tests. 
45 | test_data = make_test_data() 46 | 47 | def test_dstandard_scaler(): 48 | all_ds_2d = np.vstack(test_data["numpy_2d"]) 49 | all_ds_4d = np.vstack(test_data["numpy_4d"]) 50 | dsses_2d = [] 51 | dsses_4d = [] 52 | for n in range(test_data["n_examples"].size): 53 | dsses_2d.append(DStandardScaler()) 54 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 55 | dsses_4d.append(DStandardScaler(channels_last=True)) 56 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 57 | save_scaler(dsses_2d[-1], "scaler.json") 58 | new_scaler = load_scaler("scaler.json") 59 | os.remove("scaler.json") 60 | pd_dss = DStandardScaler() 61 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0]) 62 | pd_inv_trans = pd_dss.inverse_transform(pd_trans) 63 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 64 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 65 | xr_dss = DStandardScaler() 66 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0]) 67 | xr_inv_trans = xr_dss.inverse_transform(xr_trans) 68 | assert type(xr_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through transform" 69 | assert type(xr_inv_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through inverse" 70 | dss_total_2d = np.sum(dsses_2d) 71 | dss_total_4d = np.sum(dsses_4d) 72 | mean_2d, var_2d = dss_total_2d.get_scales() 73 | mean_4d, var_4d = dss_total_4d.get_scales() 74 | all_2d_var = all_ds_2d.var(axis=0) 75 | all_4d_var = np.array([all_ds_4d[..., i].var() for i in range(all_ds_4d.shape[-1])]) 76 | all_4d_mean = np.array([all_ds_4d[..., i].mean() for i in range(all_ds_4d.shape[-1])]) 77 | assert mean_2d.shape[0] == test_data["means"].shape[0] and var_2d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 78 | assert mean_4d.shape[0] == test_data["means"].shape[0] and var_4d.shape[0] == test_data["sds"].shape[0], "Stat shape mismatch" 79 | assert np.max(np.abs(mean_2d - all_ds_2d.mean(axis=0))) < 1e-5, "significant difference in means" 80 | assert np.max(np.abs(var_2d - all_2d_var) / all_2d_var) < 1e-5, "significant difference in variances" 81 | assert np.max(np.abs(mean_4d - all_4d_mean) / all_4d_mean) < 1e-5, "significant difference in means" 82 | assert np.max(np.abs(var_4d - all_4d_var) / all_4d_var) < 1e-5, "significant difference in variances" 83 | sub_cols = ["d", "b"] 84 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols]) 85 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 86 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans) 87 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 
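# Illustrative usage sketch (not one of the unit tests above): the distributed scalers are designed for
# map-reduce style fitting, where a separate scaler is fit on each shard of data and the fitted scalers
# are then summed to represent the full dataset, as exercised in test_dstandard_scaler above. The shard
# arrays below are hypothetical stand-ins for data loaded from separate files.
def example_dstandard_scaler_map_reduce():
    shards = [np.random.normal(size=(200, 5)) for _ in range(3)]  # pretend each shard came from one file
    shard_scalers = []
    for shard in shards:
        scaler = DStandardScaler()
        scaler.fit(shard)  # "map" step: fit each shard independently
        shard_scalers.append(scaler)
    combined = np.sum(shard_scalers)  # "reduce" step: merge the per-shard statistics
    pooled_mean, pooled_var = combined.get_scales()  # pooled mean and variance per column
    scaled = combined.transform(shards[0])  # transform any shard with full-dataset statistics
    return pooled_mean, pooled_var, scaled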
88 | 89 | 90 | def test_dminmax_scaler(): 91 | all_ds_2d = np.vstack(test_data["numpy_2d"]) 92 | dsses_2d = [] 93 | dsses_4d = [] 94 | for n in range(test_data["n_examples"].size): 95 | dsses_2d.append(DMinMaxScaler()) 96 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 97 | dsses_4d.append(DMinMaxScaler()) 98 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 99 | save_scaler(dsses_2d[-1], "scaler.json") 100 | new_scaler = load_scaler("scaler.json") 101 | os.remove("scaler.json") 102 | dss_total_2d = np.sum(dsses_2d) 103 | dss_total_4d = np.sum(dsses_4d) 104 | min_2d, max_2d = dss_total_2d.get_scales() 105 | min_4d, max_4d = dss_total_4d.get_scales() 106 | n_cols = test_data["numpy_2d"][0].shape[1] 107 | pd_dss = DMinMaxScaler() 108 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0]) 109 | pd_inv_trans = pd_dss.inverse_transform(pd_trans) 110 | sub_cols = ["d", "b"] 111 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols]) 112 | assert pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 113 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans) 114 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 115 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 116 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 117 | xr_dss = DMinMaxScaler() 118 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0]) 119 | xr_inv_trans = xr_dss.inverse_transform(xr_trans) 120 | assert type(xr_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through transform" 121 | assert type(xr_inv_trans) is type(test_data["xarray"][0]), "Pandas DataFrame type not passed through inverse" 122 | assert min_2d.shape[0] == n_cols and max_2d.shape[0] == n_cols, "Stat shape mismatch" 123 | assert min_4d.shape[0] == n_cols and max_4d.shape[0] == n_cols, "Stat shape mismatch" 124 | assert np.max(np.abs(min_2d - all_ds_2d.min(axis=0))) < 1e-8, "significant difference in means" 125 | assert np.max(np.abs(max_2d - all_ds_2d.max(axis=0))) < 1e-8, "significant difference in variances" 126 | 127 | 128 | def test_dquantile_scaler(): 129 | dsses_2d = [] 130 | dsses_4d = [] 131 | pool = Pool(2) 132 | for n in range(test_data["n_examples"].size): 133 | dsses_2d.append(DQuantileScaler()) 134 | dsses_2d[-1].fit(test_data["numpy_2d"][n]) 135 | dsses_4d.append(DQuantileScaler()) 136 | dsses_4d[-1].fit(test_data["numpy_4d"][n]) 137 | ds_2d_transformed = dsses_2d[-1].transform(test_data["numpy_2d"][n], pool=pool) 138 | ds_4d_transformed = dsses_4d[-1].transform(test_data["numpy_4d"][n], pool=pool) 139 | ds_2d_it = dsses_2d[-1].inverse_transform(ds_2d_transformed, pool=pool) 140 | ds_4d_it = dsses_4d[-1].inverse_transform(ds_4d_transformed, pool=pool) 141 | assert ds_2d_transformed.max() <= 1, "Quantile transform > 1" 142 | assert ds_4d_transformed.max() <= 1, "Quantile transform > 1" 143 | save_scaler(dsses_2d[-1], "scaler.json") 144 | new_scaler = load_scaler("scaler.json") 145 | os.remove("scaler.json") 146 | assert np.nanargmax(np.abs((new_scaler.min_ - dsses_2d[-1].min_))) == 0, \ 147 | "Differences in scaler centroid values after loading" 148 | pd_dss = DQuantileScaler() 149 | pd_trans = pd_dss.fit_transform(test_data["pandas"][0], pool=pool) 150 | pd_inv_trans = pd_dss.inverse_transform(pd_trans, pool=pool) 151 | sub_cols = ["d", "b"] 152 | pd_sub_trans = pd_dss.transform(test_data["pandas"][0][sub_cols], pool=pool) 153 | assert 
pd_sub_trans.shape[1] == len(sub_cols), "Did not subset properly" 154 | pd_sub_inv_trans = pd_dss.inverse_transform(pd_sub_trans, pool=pool) 155 | assert pd_sub_inv_trans.shape[1] == len(sub_cols), "Did not subset properly on inverse." 156 | assert type(pd_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through transform" 157 | assert type(pd_inv_trans) is type(test_data["pandas"][0]), "Pandas DataFrame type not passed through inverse" 158 | xr_dss = DQuantileScaler(distribution="normal") 159 | xr_trans = xr_dss.fit_transform(test_data["xarray"][0], pool=pool) 160 | xr_inv_trans = xr_dss.inverse_transform(xr_trans, pool=pool) 161 | assert np.all(~np.isnan(xr_trans)), "nans in transform" 162 | assert np.all(~np.isnan(xr_inv_trans)), "nans in inverse transform" 163 | assert xr_trans.shape == test_data["xarray"][0].shape, "shape does not match" 164 | assert xr_inv_trans.shape == test_data["xarray"][0].shape, "shape does not match" 165 | 166 | # assert np.max(np.abs(xr_inv_trans.values - test_data["xarray"][0].values)) < 1e-3, "Differences in transform" 167 | combined_scaler = np.sum(dsses_2d) 168 | assert combined_scaler.size_[0] == test_data["n_examples"].sum(), \ 169 | "Summing did not work properly." 170 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 171 | xr_dss_first = xr_dss.transform(test_data_c_first, channels_last=False, pool=pool) 172 | xr_inv_dss_first = xr_dss.inverse_transform(xr_dss_first, channels_last=False, pool=pool) 173 | assert xr_dss_first.shape == xr_inv_dss_first.shape, "shape does not match" 174 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 175 | xr_dss_f.fit(test_data_c_first) 176 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, pool=pool) 177 | assert scaled_data_quantile_first.shape == test_data_c_first.shape 178 | if pool is not None: 179 | pool.close() 180 | pool.join() 181 | return 182 | 183 | if __name__ == "__main__": 184 | from time import perf_counter 185 | start = perf_counter() 186 | test_data_c_first = test_data["xarray"][0].transpose("batch", "variable", "y", "x").astype("float32") 187 | xr_dss_f = DQuantileScaler(distribution="normal", channels_last=False) 188 | xr_dss_f.fit(test_data_c_first, n_jobs=16) 189 | scaled_data_quantile_first = xr_dss_f.transform(test_data_c_first, n_jobs=16) 190 | stop = perf_counter() 191 | print(stop - start) 192 | -------------------------------------------------------------------------------- /bridgescaler/distributed_tensor.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import importlib.util 3 | 4 | from packaging import version 5 | import torch 6 | 7 | REQUIRED_VERSION = "2.0.0" # required torch version 8 | 9 | # Check if PyTorch is installed 10 | if importlib.util.find_spec("torch") is None: 11 | raise ImportError("PyTorch is not installed") 12 | 13 | installed_version = torch.__version__ 14 | 15 | # Validate version 16 | if version.parse(installed_version) < version.parse(REQUIRED_VERSION): 17 | raise RuntimeError( 18 | f"PyTorch version mismatch: required {REQUIRED_VERSION}, " 19 | f"found {installed_version}" 20 | ) 21 | 22 | 23 | class DBaseScalerTensor: 24 | """ 25 | Base distributed scaler class for tensor. Used only to store attributes and methods shared across all distributed 26 | scaler subclasses. 
27 | """ 28 | 29 | def __init__(self, channels_last=True): 30 | self.x_columns_ = None 31 | self._fit = False 32 | self.channels_last = channels_last 33 | 34 | def is_fit(self): 35 | return self._fit 36 | 37 | @staticmethod 38 | def extract_x_columns(x, channels_last=True): 39 | """ 40 | Extract column indices to be transformed from x. All of these assume that the columns are in the last dimension. 41 | 42 | Args: 43 | x (torch.tensor): tensor of values to be transformed. 44 | channels_last (bool): If True, then assume the variable or channel dimension is the last dimension of the 45 | array. If False, then assume the variable or channel dimension is second. 46 | 47 | Returns: 48 | x_columns (torch.tensor): tensor of column indices. 49 | """ 50 | var_dim_num = -1 51 | if not channels_last: 52 | var_dim_num = 1 53 | assert isinstance(x, torch.Tensor), "Input must be a PyTorch tensor" 54 | x_columns = torch.arange(x.shape[var_dim_num]) 55 | return x_columns 56 | 57 | def set_channel_dim(self, channels_last=None): 58 | if channels_last is None: 59 | channels_last = self.channels_last 60 | if channels_last: 61 | channel_dim = -1 62 | else: 63 | channel_dim = 1 64 | return channel_dim 65 | 66 | def process_x_for_transform(self, x, channels_last=None): 67 | if channels_last is None: 68 | channels_last = self.channels_last 69 | channel_dim = self.set_channel_dim(channels_last) 70 | assert self._fit, "Scaler has not been fit." 71 | assert ( 72 | x.shape[channel_dim] == self.x_columns_.shape[0] 73 | ), "Number of input columns does not match scaler." 74 | x_col_order = torch.arange(x.shape[channel_dim]) 75 | xv = x 76 | x_transformed = torch.zeros(xv.shape, dtype=xv.dtype) 77 | return xv, x_transformed, channels_last, channel_dim, x_col_order 78 | 79 | def fit(self, x, weight=None): 80 | pass 81 | 82 | def transform(self, x, channels_last=None): 83 | pass 84 | 85 | def fit_transform(self, x, channels_last=None, weight=None): 86 | self.fit(x, weight=weight) 87 | return self.transform(x, channels_last=channels_last) 88 | 89 | def inverse_transform(self, x, channels_last=None): 90 | pass 91 | 92 | def __add__(self, other): 93 | pass 94 | 95 | def subset_columns(self, sel_columns): 96 | pass 97 | 98 | def add_variables(self, other): 99 | pass 100 | 101 | 102 | class DStandardScalerTensor(DBaseScalerTensor): 103 | """ 104 | Distributed version of StandardScaler. You can calculate this map-reduce style by running it on individual 105 | data files, returning the fitted objects, and then summing them together to represent the full dataset. Scaler 106 | supports torch.tensor and returns a transformed tensor. 
107 | """ 108 | 109 | def __init__(self, channels_last=True): 110 | self.mean_x_ = None 111 | self.n_ = 0 112 | self.var_x_ = None 113 | super().__init__(channels_last=channels_last) 114 | 115 | def fit(self, x, weight=None): 116 | x_columns = self.extract_x_columns(x, channels_last=self.channels_last) 117 | xv = x 118 | channel_dim = self.set_channel_dim() 119 | if not self._fit: 120 | self.x_columns_ = x_columns 121 | if len(xv.shape) > 2: 122 | if self.channels_last: 123 | self.n_ += torch.prod(torch.tensor(xv.shape[:-1])) 124 | else: 125 | self.n_ += xv.shape[0] * \ 126 | torch.prod(torch.tensor(xv.shape[2:])) 127 | else: 128 | self.n_ += xv.shape[0] 129 | self.mean_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 130 | self.var_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 131 | 132 | if self.channels_last: 133 | for i in range(xv.shape[channel_dim]): 134 | self.mean_x_[i] = torch.mean(xv[..., i]) 135 | self.var_x_[i] = torch.var(xv[..., i], correction=0) 136 | else: 137 | for i in range(xv.shape[channel_dim]): 138 | self.mean_x_[i] = torch.mean(xv[:, i]) 139 | self.var_x_[i] = torch.var(xv[:, i], correction=0) 140 | 141 | else: 142 | # Update existing scaler with new data 143 | assert ( 144 | x.shape[channel_dim] == self.x_columns_.shape[0] 145 | ), "New data has a different number of columns" 146 | if self.channels_last: 147 | x_col_order = torch.arange(x.shape[-1]) 148 | else: 149 | x_col_order = torch.arange(x.shape[1]) 150 | if len(xv.shape) > 2: 151 | if self.channels_last: 152 | new_n = torch.prod(torch.tensor(xv.shape[:-1])) 153 | else: 154 | new_n = xv.shape[0] * \ 155 | torch.prod(torch.tensor(xv.shape[2:])) 156 | else: 157 | new_n = xv.shape[0] 158 | for i, o in enumerate(x_col_order): 159 | if self.channels_last: 160 | new_mean = torch.mean(xv[..., i]) 161 | new_var = torch.var(xv[..., i], correction=0) 162 | else: 163 | new_mean = torch.mean(xv[:, i]) 164 | new_var = torch.var(xv[:, i], correction=0) 165 | combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / ( 166 | self.n_ + new_n 167 | ) 168 | weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / ( 169 | self.n_ + new_n 170 | ) 171 | var_correction = ( 172 | self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2 173 | ) / ((self.n_ + new_n) ** 2) 174 | self.mean_x_[o] = combined_mean 175 | self.var_x_[o] = weighted_var + var_correction 176 | self.n_ += new_n 177 | self._fit = True 178 | 179 | def transform(self, x, channels_last=None): 180 | """ 181 | Transform the input data from its original form to standard scaled form. If your input data has a 182 | different dimension order than the data used to fit the scaler, use the channels_last keyword argument 183 | to specify whether the new data are `channels_last` (True) or `channels_first` (False). 184 | 185 | Args: 186 | x (torch.tensor): Input data. 187 | channels_last: Override the default channels_last parameter of the scaler. 188 | 189 | Returns: 190 | x_transformed (torch.tensor): Transformed data in the same shape and type as x. 
191 | """ 192 | ( 193 | xv, 194 | x_transformed, 195 | channels_last, 196 | channel_dim, 197 | x_col_order, 198 | ) = self.process_x_for_transform(x, channels_last) 199 | x_mean, x_var = self.get_scales() 200 | if channels_last: 201 | for i, o in enumerate(x_col_order): 202 | x_transformed[..., i] = ( 203 | xv[..., i] - x_mean[o]) / torch.sqrt(x_var[o]) 204 | else: 205 | for i, o in enumerate(x_col_order): 206 | x_transformed[:, i] = ( 207 | xv[:, i] - x_mean[o]) / torch.sqrt(x_var[o]) 208 | return x_transformed 209 | 210 | def inverse_transform(self, x, channels_last=None): 211 | ( 212 | xv, 213 | x_transformed, 214 | channels_last, 215 | channel_dim, 216 | x_col_order, 217 | ) = self.process_x_for_transform(x, channels_last) 218 | x_mean, x_var = self.get_scales() 219 | if channels_last: 220 | for i, o in enumerate(x_col_order): 221 | x_transformed[..., i] = xv[..., i] * \ 222 | torch.sqrt(x_var[o]) + x_mean[o] 223 | else: 224 | for i, o in enumerate(x_col_order): 225 | x_transformed[:, i] = xv[:, i] * \ 226 | torch.sqrt(x_var[o]) + x_mean[o] 227 | return x_transformed 228 | 229 | def get_scales(self): 230 | return self.mean_x_, self.var_x_ 231 | 232 | def __add__(self, other): 233 | assert ( 234 | type(other) is DStandardScalerTensor 235 | ), "Input is not DStandardScalerTensor" 236 | assert torch.all( 237 | other.x_columns_ == self.x_columns_ 238 | ), "Scaler columns do not match." 239 | current = deepcopy(self) 240 | current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / ( 241 | self.n_ + other.n_ 242 | ) 243 | combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / ( 244 | self.n_ + other.n_ 245 | ) 246 | combined_var_corr = ( 247 | self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2 248 | ) / ((self.n_ + other.n_) ** 2) 249 | current.var_x_ = combined_var + combined_var_corr 250 | current.n_ = self.n_ + other.n_ 251 | return current 252 | 253 | 254 | class DMinMaxScalerTensor(DBaseScalerTensor): 255 | """ 256 | Distributed MinMaxScaler enables calculation of min and max of variables in datasets in parallel, then combining 257 | the mins and maxes as a reduction step. Scaler 258 | supports torch.tensor and will return a transformed array in the 259 | same form as the original with column or coordinate names preserved. 
260 | """ 261 | 262 | def __init__(self, channels_last=True): 263 | self.max_x_ = None 264 | self.min_x_ = None 265 | super().__init__(channels_last=channels_last) 266 | 267 | def fit(self, x, weight=None): 268 | x_columns = self.extract_x_columns(x, channels_last=self.channels_last) 269 | xv = x 270 | channel_dim = self.set_channel_dim() 271 | if not self._fit: 272 | self.x_columns_ = x_columns 273 | self.max_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 274 | self.min_x_ = torch.zeros(xv.shape[channel_dim], dtype=xv.dtype) 275 | 276 | if self.channels_last: 277 | for i in range(xv.shape[channel_dim]): 278 | self.max_x_[i] = torch.max(xv[..., i]) 279 | self.min_x_[i] = torch.min(xv[..., i]) 280 | else: 281 | for i in range(xv.shape[channel_dim]): 282 | self.max_x_[i] = torch.max(xv[:, i]) 283 | self.min_x_[i] = torch.min(xv[:, i]) 284 | else: 285 | # Update existing scaler with new data 286 | assert ( 287 | x.shape[channel_dim] == self.x_columns_.shape[0] 288 | ), "New data has a different number of columns" 289 | if self.channels_last: 290 | x_col_order = torch.arange(x.shape[-1]) 291 | else: 292 | x_col_order = torch.arange(x.shape[1]) 293 | if self.channels_last: 294 | for i, o in enumerate(x_col_order): 295 | self.max_x_[o] = torch.maximum( 296 | self.max_x_[o], torch.max(xv[..., i]) 297 | ) 298 | self.min_x_[o] = torch.minimum( 299 | self.min_x_[o], torch.min(xv[..., i]) 300 | ) 301 | else: 302 | for i, o in enumerate(xv.shape[channel_dim]): 303 | self.max_x_[o] = torch.maximum( 304 | self.max_x_[o], torch.max(xv[:, i])) 305 | self.min_x_[o] = torch.minimum( 306 | self.min_x_[o], torch.min(xv[:, i])) 307 | self._fit = True 308 | 309 | def transform(self, x, channels_last=None): 310 | ( 311 | xv, 312 | x_transformed, 313 | channels_last, 314 | channel_dim, 315 | x_col_order, 316 | ) = self.process_x_for_transform(x, channels_last) 317 | if channels_last: 318 | for i, o in enumerate(x_col_order): 319 | x_transformed[..., i] = (xv[..., i] - self.min_x_[o]) / ( 320 | self.max_x_[o] - self.min_x_[o] 321 | ) 322 | else: 323 | for i, o in enumerate(x_col_order): 324 | x_transformed[:, i] = (xv[:, i] - self.min_x_[o]) / ( 325 | self.max_x_[o] - self.min_x_[o] 326 | ) 327 | return x_transformed 328 | 329 | def inverse_transform(self, x, channels_last=None): 330 | ( 331 | xv, 332 | x_transformed, 333 | channels_last, 334 | channel_dim, 335 | x_col_order, 336 | ) = self.process_x_for_transform(x, channels_last) 337 | if channels_last: 338 | for i, o in enumerate(x_col_order): 339 | x_transformed[..., i] = ( 340 | xv[..., i] * (self.max_x_[o] - self.min_x_[o] 341 | ) + self.min_x_[o] 342 | ) 343 | else: 344 | for i, o in enumerate(x_col_order): 345 | x_transformed[:, i] = ( 346 | xv[:, i] * (self.max_x_[o] - self.min_x_[o]) + 347 | self.min_x_[o] 348 | ) 349 | return x_transformed 350 | 351 | def get_scales(self): 352 | return self.min_x_, self.max_x_ 353 | 354 | def __add__(self, other): 355 | assert type(other) is DMinMaxScalerTensor, "Input is not DMinMaxScaler" 356 | assert torch.all( 357 | other.x_columns_ == self.x_columns_ 358 | ), "Scaler columns do not match." 
359 | current = deepcopy(self) 360 | current.max_x_ = torch.maximum(self.max_x_, other.max_x_) 361 | current.min_x_ = torch.minimum(self.min_x_, other.min_x_) 362 | return current 363 | -------------------------------------------------------------------------------- /notebooks/Bridgscaler_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63310dd8-a974-4bcb-8d91-2250077b548e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Bridgescaler Introduction" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "32c43d07-5af2-47c3-9656-fc11f640f66c", 14 | "metadata": {}, 15 | "source": [ 16 | "This is a short notebook covering some of the features and use cases of bridgescaler. The main repository can be found [here](https://github.com/NCAR/bridgescaler).\n", 17 | "\n", 18 | "Bridgescaler is designed to add some functionality to scikit-learn pre-processors. " 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "2d0b7dda-e7af-4fab-9069-2690c860f0f1", 24 | "metadata": {}, 25 | "source": [ 26 | "#### Install" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "id": "7848129f-d7b5-48d3-8da4-6e5b09026777", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# uncomment the line below if you need to install bridgescaler\n", 37 | "# !pip install bridgescaler" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "9b07b11c-8981-4e58-a8fa-a06f07cca893", 43 | "metadata": {}, 44 | "source": [ 45 | "## Imports" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "838e6400-b2b9-43d0-8a31-844dba199f7f", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import numpy as np\n", 56 | "import pandas as pd\n", 57 | "import matplotlib.pyplot as plt\n", 58 | "\n", 59 | "import bridgescaler \n", 60 | "from bridgescaler import save_scaler, load_scaler\n", 61 | "from bridgescaler.group import GroupStandardScaler\n", 62 | "from sklearn.preprocessing import QuantileTransformer" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "36dd5aaf-1555-46a0-b0eb-76546d72c63c", 68 | "metadata": {}, 69 | "source": [ 70 | "## Numpy Example" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "id": "634f7659-21c1-4f96-ab3c-237ea85ee588", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Define mean and standard deviation\n", 81 | "mean = 500 # This is approximately the midpoint of 10 and 1000\n", 82 | "std_dev = 300 # This value is chosen to spread values roughly within the desired range\n", 83 | "\n", 84 | "# Generate the array\n", 85 | "gaussian_array = np.random.normal(mean, std_dev, 10000)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "id": "f7be8c82-9f58-4835-8f95-5606b7ac6850", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Remaining array shape: (8000,)\n", 99 | "Random samples shape: (2000,)\n" 100 | ] 101 | }, 102 | { 103 | "data": { 104 | "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/H5lhTAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzCklEQVR4nO3deVhV5aLH8d9WBhFxCyJsOCLi7CMOpSeVyjEHTmpF9zh0jmmZN3MoUrPMa1KPOZ1EKzMbzKlMvaXVPZqKj/OUSnpzqqMpDiVShqBmjOv+4XUdNyAKbtgL+H6eZz0Pa613rfWu1y37x7vetZbNMAxDAAAAFlLJ3RUAAADIi4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsh4ACAAAsx8PdFSiO3Nxc/fzzz/Lz85PNZnN3dQAAwG0wDEOXLl1SaGioKlUqvI+kTAaUn3/+WWFhYe6uBgAAKIYzZ86odu3ahZYpkwHFz89P0rUTrF69uptrAwAAbkd6errCwsLM7/HClMmAcv2yTvXq1QkoAACUMbczPINBsgAAwHIIKAAAwHIIKAAAwHLK5BgUAMA1OTk5ysrKcnc1AJOnp6cqV658x/shoABAGXX58mWdPXtWhmG4uyqAyWazqXbt2qpWrdod7YeAAgBlUE5Ojs6ePauqVauqVq1aPLQSlmAYhn755RedPXtWDRs2vKOeFAIKAJRBWVlZMgxDtWrVko+Pj7urA5hq1aqlpKQkZWVl3VFAYZAsAJRh9JzAalz1mSSgAAAAyyGgAAAqrMGDB+vhhx8u0jY2m01ffPFFidQH/8YYFAAoR+q+tLpUj5c07cFSPZ6rvfnmm0W+C+rcuXPy9/cvoRrhOgIKAMBtMjMz5eXl5bbj2+32Im/jcDhKoCZ3LisrS56enre93Oq4xAMAKDWdOnXSyJEjNXr0aAUGBqpbt26SpCNHjugvf/mLqlWrpuDgYA0cOFC//vqr03ajRo1SbGys/P39FRwcrPfff19XrlzRE088IT8/P9WvX19ff/21uU1OTo6GDBmiiIgI+fj4qHHjxnrzzTed6pP3Ek+nTp307LPPaty4cQoICJDD4VBcXJzTNjde4klKSpLNZtPKlSvVuXNnVa1aVS1bttSuXbuctvnggw8UFhamqlWr6pFHHlF8fLxq1KhRaFu9+OKLatSokapWrap69epp4sSJTg/li4uLU6tWrfTRRx+pXr168vb2lmEYstlsmjdvnh566CH5+vpq8uTJt2yLrVu3ytPTU8nJyU51GDNmjDp06FBoPUsKAQUAUKoWLVokDw8P7dixQ++9957OnTunjh07qlWrVtq3b5/Wrl2r8+fPq2/fvvm2CwwM1J49ezRq1Cg988wz+utf/6qoqCh9++236tGjhwYOHKjff/9dkpSbm6vatWtrxYoVOnLkiF555RW9/PLLWrFixS3r5+vrq2+++UYzZszQa6+9poSEhEK3mTBhgsaOHasDBw6oUaNGGjBggLKzsyVJO3bs0LBhw/Tcc8/pwIED6tatm15//fVbtpOfn58WLlyoI0eO6M0339QHH3ygWbNmOZU5fvy4VqxYoc8//1wHDhwwl0+aNEkPPfSQDh48qCeffPKWbdGhQwfVq1dPS5YsMfeRnZ2tjz/+WE888cQt61oSbEYZfARhenq67Ha70tLSVL16dXdXByjzChq3UNbHFpR3f/zxh06ePKmIiAhVqVLFXG71MSidOnVSWlqa9u/fby575ZVX9M0332jdunXmsrNnzyosLEw//PCDGjVqpE6dOiknJ0fbtm2TdK13xG63KyYmRosXL5YkJScnKyQkRLt27VK7du0KPP6IESN0/vx5ffbZZ5Ku9aBcvHjR7BHJexxJuueee9SlSxdNmzZN0rUelFWrVunhhx9WUlKSIiIi9OGHH2rIkCGSrvUGNWvWTEePHlWTJk3Uv39/Xb58Wf/85z/Nff7973/XP//5T128ePG22+4f//iHli9frn379km61oMyZcoU/fTTT6pVq5ZZzmazKTY2Nl+YuVVbzJgxwwxEkvTll1/q73//u5KTk+Xr63vb9bzZZ1Mq2vc3PSgAgFLVpk0bp/nExERt2rRJ1apVM6cmTZpIkn788UezXIsWLcyfK1eurJo1a6p58+bmsuDgYElSSkqKuWzevHlq06aNatWqpWrVqumDDz7Q6dOnC63fjceRpJCQEKd93mqbkJAQp3r88MMPuueee5zK550vyGeffab77rtPDodD1apV08SJE/PVPTw83CmcXJe3jaVbt8XgwYN1/Phx7d69W5L00UcfqW/fvkUKJ67EIFkAQKnK+4WXm5ur3r17a/r06fnKXv+yl5RvoKfNZnNadv0BYbm5uZKkFStW6Pnnn9fMmTPVvn17+fn56R//+Ie++eabQutX0HGu7/N2tslbj+vjQm50q4sXu3fvVv/+/fXqq6+qR48estvtWrZsmWbOnOlU7mbhIe/y22mLoKAg9e7dWwsWLFC9evW0Zs0abd68udB6liQCCgDAre6++259/vnnqlu3rjw8XPe1tG3bNkVFRWn48OHmsht7ZEpLkyZNtGfPHqdl1y/T3MyOHTsUHh6uCRMmmMtOnTpV7Drcbls89dRT6t+/v2rXrq369evr3nvvLfYx7xSXeAAAbjVixAj99ttvGjBggPbs2aMTJ05o/fr1evLJJ5WTk1Ps/TZo0ED79u3TunXr9K9//UsTJ07U3r17XVjz2zNq1CitWbNG8fHxOnbsmN577z19/fXXhT4SvkGDBjp9+rSWLVumH3/8UW+99ZZWrVpV7Drcbltc762ZPHmy2wbHXkdAAQC4VWhoqHbs2KGcnBz16NFDkZGReu6552S321WpUvG/poYNG6aYmBj169dPbdu21YULF5x6EErLvffeq3nz5ik+Pl4tW7bU2rVr9fzzz+cbQHqjhx56SM8//7xGjhypVq1aaefOnZo4cWKx63C7bVGpUiUNHjxYOTk5evzxx4t9PFfgLh4A3MVTBhV2pwSsb+jQofr++++d7hayiqFDh+r8+fP66quvirW9q+7iYQwKAAAl7I033lC3bt3k6+urr7/+WosWLdLcuXPdXS0naWlp2rt3rz755BN9+eWX7q4OAQUAgJK2Z88ezZgxQ5cuXVK9evX01ltv6amnnnJ3tZw89NBD2rNnj55++mnzCb/uREABAKCE3erptVbgzluKC8IgWQAAYDkEFAAAYDlc4gFQovLeIcTdQQBuBz0oAADAcggoAADAcggoAADAcggoAIBya/DgwXr44YfdXQ3L2Lx5s2w2my5evHhH+6lbt65mz57tkjrdDINkAaA8ibOX8vHSSvd4qDDoQQEAuE1mZqa7qwCLIqAAAEpNp06dNHLkSI0ePVqBgYHmI9Xj4+PVvHlz+fr6KiwsTMOHD9fly5fN7RYuXKgaNWpo3bp1atq0qapVq6aePXvq3LlzZpmcnByNHj1aNWrUUM2aNTVu3DjlfR9uRkaGnn32WQUFBalKlSq67777tHfvXnP99Usg69at01
133SUfHx916dJFKSkp+vrrr9W0aVNVr15dAwYM0O+//37T8zx16pR69+4tf39/+fr6qlmzZlqzZo1ZzyFDhigiIkI+Pj5q3Lix3nzzTaftr1+amjJlioKDg1WjRg29+uqrys7O1gsvvKCAgADVrl1bH330kblNUlKSbDabli1bpqioKFWpUkXNmjW75RNid+7cqQ4dOsjHx0dhYWF69tlndeXKFXN9SkqKevfuLR8fH0VEROiTTz4pdH+uQkABAJSqRYsWycPDQzt27NB7770nSapUqZLeeustHTp0SIsWLdLGjRs1btw4p+1+//13vfHGG1qyZIm2bt2q06dPa+zYseb6mTNn6qOPPtL8+fO1fft2/fbbb1q1apXTPsaNG6fPP/9cixYt0rfffqsGDRqoR48e+u2335zKxcXFac6cOdq5c6fOnDmjvn37avbs2Vq6dKlWr16thIQEvf322zc9xxEjRigjI0Nbt27VwYMHNX36dFWrVk2SlJubq9q1a2vFihU6cuSIXnnlFb388sv5Hoe/ceNG/fzzz9q6davi4+MVFxenXr16yd/fX998842GDRumYcOG6cyZM07bvfDCCxozZoz279+vqKgo9enTRxcuXCiwngcPHlSPHj0UExOj7777TsuXL9f27ds1cuRIs8zgwYOVlJSkjRs36rPPPtPcuXOVkpJy03N3FZuRN16WAUV5XTOAW8v7MDXJdQ9U40FtJeOmr7S3+BiUTp06KS0tTfv37y+03H//93/rmWee0a+//irpWg/KE088oePHj6t+/fqSpLlz5+q1115TcnKyJCk0NFTPPfecXnzxRUlSdna2IiIi1Lp1a33xxRe6cuWK/P39tXDhQj322GOSpKysLNWtW1exsbF64YUXtHnzZnXu3FkbNmxQ165dJUnTpk3T+PHj9eOPP6pevXqSpGHDhikpKUlr164tsP4tWrTQo48+qkmTJt1Wu4wYMULnz5/XZ599JulaKNi8ebNOnDihSpWu9SU0adJEQUFB2rp1q6RrPTF2u10ffvih+vfvr6SkJEVERGjatGn52mDUqFEaN26ceX6pqamqUaOGHn/8cfn4+JhBUZK2b9+ujh076sqVKzp9+rQaN26s3bt3q23btpKk77//Xk2bNtWsWbMUGxub71xu+tlU0b6/GSQLVEAFBRKgtLRp0ybfsk2bNmnKlCk6cuSI0tPTlZ2drT/++ENXrlyRr6+vJKlq1apmOJGkkJAQ8y/5tLQ0nTt3Tu3btzfXe3h4qE2bNuZlnh9//FFZWVm69957zTKenp665557dPToUaf6tGjRwvw5ODhYVatWNcPJ9WV79uy56Tk+++yzeuaZZ7R+/Xo98MADevTRR532OW/ePH344Yc6deqUrl69qszMTLVq1cppH82aNTPDyfVjRkZGmvOVK1dWzZo18/VmFNQGec/vusTERB0/ftzpso1hGMrNzdXJkyf1r3/9y9zHdU2aNFGNGjVueu6uwiUeAGVC3ZdWO00ou64HjutOnTqlv/zlL4qMjNTnn3+uxMREvfPOO5Ku9XBc5+np6bSdzWbLN8akMNfL2my2fMvzLrvxWDabrcBj5+bm3vRYTz31lE6cOKGBAwfq4MGDatOmjXlJaMWKFXr++ef15JNPav369Tpw4ICeeOKJfAOGCzpmUetxY7mC5Obm6umnn9aBAwfM6X//93917Ngx1a9f/6ZtVhoIKAAAt9q3b5+ys7M1c+ZMtWvXTo0aNdLPP/9cpH3Y7XaFhIRo9+7d5rLs7GwlJiaa8w0aNJCXl5e2b99uLsvKytK+ffvUtGnTOz+RPMLCwjRs2DCtXLlSY8aM0QcffCBJ2rZtm6KiojR8+HDdddddatCggX788UeXHbegNmjSpEmBZe+++24dPnxYDRo0yDd5eXmpadOmys7O1r59+8xtfvjhhzt+jsrt4BIPAMCt6tevr+zsbL399tvq3bu3duzYoXnz5hV5P88995ymTZumhg0bqmnTpoqPj3f6IvX19dUzzzxj3gVTp04dzZgxQ7///ruGDBniwjOSYmNjFR0drUaNGik1NVUbN240Q1CDBg20ePFirVu3ThEREVqyZIn27t2riIgIlxz7nXfeMdtg1qxZSk1N1ZNPPllg2RdffFHt2rXTiBEjNHToUPn6+uro0aPmIODGjRurZ8+eGjp0qN5//315eHgoNjZWPj4+LqlrYehBAQC4VatWrRQfH6/p06crMjJSn3zyiaZOnVrk/YwZM0aPP/64Bg8erPbt28vPz0+PPPKIU5lp06bp0Ucf1cCBA3X33Xfr+PHjWrdunfz9/V11OpKuDWAdMWKEmjZtqp49e6px48aaO3eupGsDbGNiYtSvXz+1bdtWFy5c0PDhw1127GnTpmn69Olq2bKltm3bpi+//FKBgYEFlm3RooW2bNmiY8eO6f7779ddd92liRMnKiQkxCyzYMEChYWFqWPHjoqJidF//ud/KigoyGX1vRnu4gEqoNsZw2G1u3i4G8hZYXdKoGK6fhfP/v378w24LU2uuouHHhQAAGA5BBQAAGA5DJIFAKAcqFu3bpFuu7Y6elAAAIDlEFAAAIDlEFAAoAwrT136KB9c9ZkkoABAGVS5cmVJyvd4dMDdrn8mr39Gi6tIg2SnTp2qlStX6vvvv5ePj4+ioqI0ffp0NW7c2CwzePBgLVq0yGm7tm3bOj16NyMjQ2PHjtWnn36qq1evqmvXrpo7d65q1659RycDoHQV59kkJfnm5IrEw8NDVatW1S+//CJPT0+nl8oB7pKbm6tffvlFVatWlYfHnd2HU6Stt2zZohEjRujPf/6zsrOzNWHCBHXv3l1HjhxxevlTz549tWDBAnPey8vLaT+xsbH6n//5Hy1btkw1a9bUmDFj1KtXLyUmJt5x4gKAisBmsykkJEQnT57UqVOn3F0dwFSpUiXVqVPnjl8wWKSAsnbtWqf5BQsWKCgoSImJierQoYO53NvbWw6Ho8B9pKWlaf78+VqyZIkeeOABSdLHH3+ssLAwbdiwQT169CjqOQBAheTl5aWGDRtymQeW4uXl5ZIevTvqf0lLS5MkBQQEOC3fvHmzgoKCVKNGDXXs2FGvv/66+dz+xMREZWVlqXv37mb50NBQRUZGaufOnQUGlIyMDGVkZJjz6enpd1JtACg3KlWqxKPuUS4VO+IYhqHRo0frvvvuU2RkpLk8Ojpan3zyiTZu3KiZM2dq79696tKlixkwkpOT5eXlle/FTMHBwUpOTi7wWFOnTpXdbjensLCw4lYbAACUAcXuQRk5cqS+++47bd++3Wl5v379zJ8jIyPVpk0bhYeHa/Xq1YqJibnp/gzDuOn1qvHjx2v06NHmfHp6OiEFAIByrFg9KKNGjdJXX32lTZs23fLOm5CQEIWHh+vYsWOSJIfDoczMTKWmpjqVS0lJUXBwcIH78Pb2VvXq1Z0mAABQfhWpB8UwDI0aNUqrVq3S5s2bFRERccttLly4oDNnzigkJESS1Lp1a3l6eiohIUF9+/aVJJ07d06HDh3SjBkzinEKA
KyioFuIAaA4ihRQRowYoaVLl+rLL7+Un5+fOWbEbrfLx8dHly9fVlxcnB599FGFhIQoKSlJL7/8sgIDA/XII4+YZYcMGaIxY8aoZs2aCggI0NixY9W8eXPzrh4AAFCxFSmgvPvuu5KkTp06OS1fsGCBBg8erMqVK+vgwYNavHixLl68qJCQEHXu3FnLly+Xn5+fWX7WrFny8PBQ3759zQe1LVy4kGegAAAAScW4xFMYHx8frVu37pb7qVKlit5++229/fbbRTk8AACoIHg2MgAAsBwCCgAAsBwCCgAAsBwCCgAAsJw7excyAFhI3uewJE170E01AXCn6EEBAACWQw8KgALRGwHAnQgoANyOMAQgLy7xAAAAy6EHBcBt4UWAAEoTPSgAAMBy6EEBYDn01gCgBwUAAFgOAQUAAFgOl3gAlEm3cxmooDLcwgyUDfSgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAy/FwdwUAoDTVfWm103zStAfdVBMAhaEHBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA5vMwYsKu9bdyXevAug4qAHBQAAWA4BBQAAWA4BBQAAWA4BBQAAWA6DZIFyrqDBtgBgdfSgAAAAyyGgAAAAyyGgAAAAyylSQJk6dar+/Oc/y8/PT0FBQXr44Yf1ww8/OJUxDENxcXEKDQ2Vj4+POnXqpMOHDzuVycjI0KhRoxQYGChfX1/16dNHZ8+evfOzAQAA5UKRAsqWLVs0YsQI7d69WwkJCcrOzlb37t115coVs8yMGTMUHx+vOXPmaO/evXI4HOrWrZsuXbpklomNjdWqVau0bNkybd++XZcvX1avXr2Uk5PjujMDAABlVpHu4lm7dq3T/IIFCxQUFKTExER16NBBhmFo9uzZmjBhgmJiYiRJixYtUnBwsJYuXaqnn35aaWlpmj9/vpYsWaIHHnhAkvTxxx8rLCxMGzZsUI8ePVx0agAAoKy6ozEoaWlpkqSAgABJ0smTJ5WcnKzu3bubZby9vdWxY0ft3LlTkpSYmKisrCynMqGhoYqMjDTL5JWRkaH09HSnCQAAlF/FDiiGYWj06NG67777FBkZKUlKTk6WJAUHBzuVDQ4ONtclJyfLy8tL/v7+Ny2T19SpU2W3280pLCysuNUGAABlQLEDysiRI/Xdd9/p008/zbfOZrM5zRuGkW9ZXoWVGT9+vNLS0szpzJkzxa02AAAoA4oVUEaNGqWvvvpKmzZtUu3atc3lDodDkvL1hKSkpJi9Kg6HQ5mZmUpNTb1pmby8vb1VvXp1pwkAAJRfRQoohmFo5MiRWrlypTZu3KiIiAin9REREXI4HEpISDCXZWZmasuWLYqKipIktW7dWp6enk5lzp07p0OHDpllAABAxVaku3hGjBihpUuX6ssvv5Sfn5/ZU2K32+Xj4yObzabY2FhNmTJFDRs2VMOGDTVlyhRVrVpVjz32mFl2yJAhGjNmjGrWrKmAgACNHTtWzZs3N+/qAVB8vHsHQHlQpIDy7rvvSpI6derktHzBggUaPHiwJGncuHG6evWqhg8frtTUVLVt21br16+Xn5+fWX7WrFny8PBQ3759dfXqVXXt2lULFy5U5cqV7+xsAABAuWAzDMNwdyWKKj09XXa7XWlpaYxHQblVUE9I0rQHb1kGRZO3TQGUnKJ8f/MuHgAAYDkEFAAAYDkEFAAAYDkEFAAAYDlFuosHgHsxKNb1bmcwMoDSRw8KAACwHAIKAACwHAIKAACwHAIKAACwHAbJAsAtMJAWKH30oAAAAMshoAAAAMshoAAAAMthDAoA5MED8QD3owcFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDgEFAABYDk+SBUpB3ieT8iZcACgcPSgAAMBy6EEB3KCgd73QqwIA/0YPCgAAsBx6UACL4A26APBv9KAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLIaAAAADLKXJA2bp1q3r37q3Q0FDZbDZ98cUXTusHDx4sm83mNLVr186pTEZGhkaNGqXAwED5+vqqT58+Onv27B2dCAAAKD+KHFCuXLmili1bas6cOTct07NnT507d86c1qxZ47Q+NjZWq1at0rJly7R9+3ZdvnxZvXr1Uk5OTtHPAAAAlDseRd0gOjpa0dHRhZbx9vaWw+EocF1aWprmz5+vJUuW6IEHHpAkffzxxwoLC9OGDRvUo0ePolYJAACUMyUyBmXz5s0KCgpSo0aNNHToUKWkpJjrEhMTlZWVpe7du5vLQkNDFRkZqZ07dxa4v4yMDKWnpztNAACg/HJ5QImOjtYnn3yijRs3aubMmdq7d6+6dOmijIwMSVJycrK8vLzk7+/vtF1wcLCSk5ML3OfUqVNlt9vNKSwszNXVBgAAFlLkSzy30q9fP/PnyMhItWnTRuHh4Vq9erViYmJuup1hGLLZbAWuGz9+vEaPHm3Op6enE1IAACjHSvw245CQEIWHh+vYsWOSJIfDoczMTKWmpjqVS0lJUXBwcIH78Pb2VvXq1Z0mAABQfpV4QLlw4YLOnDmjkJAQSVLr1q3l6emphIQEs8y5c+d06NAhRUVFlXR1AABAGVDkSzyXL1/W8ePHzfmTJ0/qwIEDCggIUEBAgOLi4vToo48qJCRESUlJevnllxUYGKhHHnlEkmS32zVkyBCNGTNGNWvWVEBAgMaOHavmzZubd/UAAICKrcgBZd++fercubM5f31syKBBg/Tuu+/q4MGDWrx4sS5evKiQkBB17txZy5cvl5+fn7nNrFmz5OHhob59++rq1avq2rWrFi5cqMqVK7vglAAAQFlnMwzDcHcliio9PV12u11paWmMR0GZUPel1e6uAlwsadqD7q4CUOYU5fubd/EAAADLIaAAAADLcflzUACgIsh72Y5LPoBr0YMCAAAsh4ACAAAsh4ACAAAsh4ACAAAsh0GyAOACBT3rhoGzQPHRgwIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACyHgAIAACzHw90VAMq6ui+tdppPmvagm2oCAOUHPSgAAMByCCgAAMByCCgAAMByCCgAAMByGCQLACWEAdRA8dGDAgAALIeAAgAALIeAAgAALIeAAgAALIdBskAhGOQIAO5BDwoAALAcAgoAALAcAgoA
ALAcAgoAALAcAgoAALAc7uIBADfiTjGgYPSgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyyGgAAAAyylyQNm6dat69+6t0NBQ2Ww2ffHFF07rDcNQXFycQkND5ePjo06dOunw4cNOZTIyMjRq1CgFBgbK19dXffr00dmzZ+/oRACrqPvS6nwTAKBoihxQrly5opYtW2rOnDkFrp8xY4bi4+M1Z84c7d27Vw6HQ926ddOlS5fMMrGxsVq1apWWLVum7du36/Lly+rVq5dycnKKfyYAAKDcKPLLAqOjoxUdHV3gOsMwNHv2bE2YMEExMTGSpEWLFik4OFhLly7V008/rbS0NM2fP19LlizRAw88IEn6+OOPFRYWpg0bNqhHjx53cDoAAKA8cOkYlJMnTyo5OVndu3c3l3l7e6tjx47auXOnJCkxMVFZWVlOZUJDQxUZGWmWySsjI0Pp6elOEwAAKL9cGlCSk5MlScHBwU7Lg4ODzXXJycny8vKSv7//TcvkNXXqVNntdnMKCwtzZbUBAIDFlMhdPDabzWneMIx8y/IqrMz48eOVlpZmTmfOnHFZXQEAgPW4NKA4HA5JytcTkpKSYvaqOBwOZWZmKjU19aZl8vL29lb16tWdJgAAUH65NKBERETI4XAoISHBXJaZmaktW7YoKipKktS6dWt5eno6lTl37pwOHTpklgEAABVbke/iuXz5so4fP27Onzx5UgcOHFBAQIDq1Kmj2NhYTZkyRQ0bNlTDhg01ZcoUVa1aVY899pgkyW63a8iQIRozZoxq1qypgIAAjR07Vs2bNzfv6gGsimeaAEDpKHJA2bdvnzp37mzOjx49WpI0aNAgLVy4UOPGjdPVq1c1fPhwpaamqm3btlq/fr38/PzMbWbNmiUPDw/17dtXV69eVdeuXbVw4UJVrlzZBacEAADKOpthGIa7K1FU6enpstvtSktLYzwKShQ9JnClpGkP5luW9zNWUBmgvCjK9zfv4gEAAJZDQAEAAJZDQAEAAJZDQAEAAJZDQAEAAJZDQAEAAJZT5OegAACKh9vWgdtHDwoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcHnUPABZS0OPwk6Y96IaaAO5FDwoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcAgoAALAcHnUP/L+CHjEOAHAPAgoAWFze8My7eVARcIkHAABYDj0oqBC4fAMAZQs9KAAAwHIIKAAAwHK4xAOgQElVHnOar/vHUjfVBEBFRA8KAACwHHpQAKCc4vZklGX0oAAAAMuhBwVAsdw4RoXxKQBcjR4UAABgOQQUAABgOQQUAABgOYxBAYBygNc5oLwhoAAw5X04GwC4CwEF5RJ/TQJA2cYYFAAAYDn0oABAGUMPISoCelAAAIDl0IMCoFTxlmQAt4MeFAAAYDkEFAAAYDkuDyhxcXGy2WxOk8PhMNcbhqG4uDiFhobKx8dHnTp10uHDh11dDaDCSqrymNMEAGVRiYxBadasmTZs2GDOV65c2fx5xowZio+P18KFC9WoUSNNnjxZ3bp10w8//CA/P7+SqA6AcorxLED5VSIBxcPDw6nX5DrDMDR79mxNmDBBMTExkqRFixYpODhYS5cu1dNPP10S1QHw/0rrC91VxyGAABVXiYxBOXbsmEJDQxUREaH+/fvrxIkTkqSTJ08qOTlZ3bt3N8t6e3urY8eO2rlzZ0lUBQAAlEEu70Fp27atFi9erEaNGun8+fOaPHmyoqKidPjwYSUnJ0uSgoODnbYJDg7WqVOnbrrPjIwMZWRkmPPp6emurjYAALAQlweU6Oho8+fmzZurffv2ql+/vhYtWqR27dpJkmw2m9M2hmHkW3ajqVOn6tVXX3V1VQEUAQNuAZSmEn9Qm6+vr5o3b65jx47p4YcfliQlJycrJCTELJOSkpKvV+VG48eP1+jRo8359PR0hYWFlVidgbLkVsGhPAWL8nQuAApX4s9BycjI0NGjRxUSEqKIiAg5HA4lJCSY6zMzM7VlyxZFRUXddB/e3t6qXr260wQAeXF7NVB+uLwHZezYserdu7fq1KmjlJQUTZ48Wenp6Ro0aJBsNptiY2M1ZcoUNWzYUA0bNtSUKVNUtWpVPfYYv1CAiogwAaAgLg8oZ8+e1YABA/Trr7+qVq1aateunXbv3q3w8HBJ0rhx43T16lUNHz5cqampatu2rdavX88zUAAAgMnlAWXZsmWFrrfZbIqLi1NcXJyrDw0AAMoJ3mYMVGBcXgFgVQQUABUeT6wFrIeAgjKv7kur3V2FCo+emLIr7/+fpGkPuqkmgLMSv80YAACgqOhBAdyEywoAcHMEFADIg/AIuB8BBbCIG78U+UK8c7caF0MbF6ygMV2MS4E7MAYFAABYDgEFAABYDpd4AOAWuPwGlD4CCiyN6+EAUDERUAAL4i4SABUdAQVlTll9cixPW7UW/j0Aa2OQLAAAsBx6UGApZbV3BADgWgQUoAwqa5cnylp9yyv+AEBZQkABygC+4AFUNAQUAECh8va8cKs/SgODZAEAgOUQUAAAgOVwiQcA7gAP1QNKBgEFAErIrQY3E2aAmyOgAEAR3Cp0cMcV4BqMQQEAAJZDQAEAAJbDJR6gBNHdD1e58bPk7rErBT2RlmejwNUIKADgJlYKHYDVEFAAF6LHBABcgzEoAADAcuhBAQALKOsPfON9PXA1AgqQR1G+KLikAwAlg4ACtynoTgAAACQCCkoItyECd6Y89s5xGQhFwSBZAABgOQQUAABgOVziAW6hPHa1A4DVEVBQahgUC7hGWbglmf/vuFNc4gEAAJZDQAEAAJbDJR6USzxsDQDKNgIKKgRCCMqzsjAmpSC387wknqlUcRFQ4BIMiAMAuBIBBUVWmmHkxr8My8pfhQCKjz92cB2DZAEAgOXQgwIA5VhZHZ8C0IMCAAAshx6UCo63iwIArIgeFAAAYDn0oOCW3HXXDgBrKez/Z2mObaHnt2IgoFhMRXsoEYEEcL2i/L+ySugA8iKgoNS5KpQQboCi4//Nv9ETY20ElIoszq6kKv+eLcpfS4XdushtjUD5QJiBO7l1kOzcuXMVERGhKlWqqHXr1tq2bZs7qwMAACzCbT0oy5cvV2xsrObOnat7771X7733nqKjo3XkyBHVqVPHXdUq++Lsd7R5SQyI5ZIOUP65sueUV1xAcmNAiY+P15AhQ/TUU09JkmbPnq1169bp3Xff1dSpU91VrTLrerC48ZJNUXFpBkBpcPXvmuL8YZVU5bFiX+JG6XBLQMnMzFRiYqJeeuklp+Xdu3fXzp0785XPyMhQRkaGOZ+WliZJSk9PL5H6RU5a5zR/6NUepbaf3Izf8y3Le5559+tU1mbcZu1u7TvbgH8f84/5hR7nxrLpGQJQwd34O0Fy/h1S2O8Pyfl3SEG/E290qMqQ265TYXX
Izfi9wO+UAn+PT63tXGj82UK3Kcjt7Kc0FVTn4n73FeZ6GxvGbXxXGW7w008/GZKMHTt2OC1//fXXjUaNGuUrP2nSJEMSExMTExMTUzmYzpw5c8us4Na7eGw2m9O8YRj5lknS+PHjNXr0aHM+NzdXv/32m2rWrFlg+bIqPT1dYWFhOnPmjKpXr+7u6pRbtHPpoa1LB+1cOmjnO2cYhi5duqTQ0NBblnVLQAkMDFTlypWVnJzstDwlJUXBwcH5ynt7e8vb29tpWY0aNUqyim5VvXp1PvylgHYuPbR16aCdSwftfGfsdvttlXPLbcZeXl5q3bq1EhISnJYnJCQoKirKHVUCAAAW4rZLPKNHj9bAgQPVpk0btW/fXu+//75Onz6tYcOGuatKAADAItwWUPr166cLFy7otdde07lz5xQZGak1a9YoPDzcXVVyO29vb02aNCnf5Sy4Fu1cemjr0kE7lw7auXTZDON27vUBAAAoPW591D0AAEBBCCgAAMByCCgAAMByCCgAAMByCChuULduXdlsNqcp73uJTp8+rd69e8vX11eBgYF69tlnlZmZ6VTm4MGD6tixo3x8fPSnP/1Jr7322u2936CCmzt3riIiIlSlShW1bt1a27Ztc3eVyoy4uLh8n12Hw2GuNwxDcXFxCg0NlY+Pjzp16qTDhw877SMjI0OjRo1SYGCgfH191adPH5096753kFjF1q1b1bt3b4WGhspms+mLL75wWu+qtk1NTdXAgQNlt9tlt9s1cOBAXbx4sYTPzjpu1c6DBw/O9xlv166dUxnauXQQUNzk+u3V16f/+q//Mtfl5OTowQcf1JUrV7R9+3YtW7ZMn3/+ucaMGWOWSU9PV7du3RQaGqq9e/fq7bff1htvvKH4+Hh3nE6ZsXz5csXGxmrChAnav3+/7r//fkVHR+v06dPurlqZ0axZM6fP7sGDB811M2bMUHx8vObMmaO9e/fK4XCoW7duunTpklkmNjZWq1at0rJly7R9+3ZdvnxZvXr1Uk5OjjtOxzKuXLmili1bas6cOQWud1XbPvbYYzpw4IDWrl2rtWvX6sCBAxo4cGCJn59V3KqdJalnz55On/E1a9Y4raedS8mdv/oPRRUeHm7MmjXrpuvXrFljVKpUyfjpp5/MZZ9++qnh7e1tpKWlGYZhGHPnzjXsdrvxxx9/mGWmTp1qhIaGGrm5uSVW97LunnvuMYYNG+a0rEmTJsZLL73kphqVLZMmTTJatmxZ4Lrc3FzD4XAY06ZNM5f98ccfht1uN+bNm2cYhmFcvHjR8PT0NJYtW2aW+emnn4xKlSoZa9euLdG6lyWSjFWrVpnzrmrbI0eOGJKM3bt3m2V27dplSDK+//77Ej4r68nbzoZhGIMGDTIeeuihm25DO5ceelDcZPr06apZs6ZatWql119/3enyza5duxQZGen0MqUePXooIyNDiYmJZpmOHTs6PTCoR48e+vnnn5WUlFRq51GWZGZmKjExUd27d3da3r17d+3cudNNtSp7jh07ptDQUEVERKh///46ceKEJOnkyZNKTk52al9vb2917NjRbN/ExERlZWU5lQkNDVVkZCT/BoVwVdvu2rVLdrtdbdu2Ncu0a9dOdrud9r/B5s2bFRQUpEaNGmno0KFKSUkx19HOpcetbzOuqJ577jndfffd8vf31549ezR+/HidPHlSH374oSQpOTk530sT/f395eXlZb5gMTk5WXXr1nUqc32b5ORkRURElPyJlDG//vqrcnJy8rVtcHBwvhdXomBt27bV4sWL1ahRI50/f16TJ09WVFSUDh8+bLZhQe176tQpSdc+m15eXvL3989Xhn+Dm3NV2yYnJysoKCjf/oOCgmj//xcdHa2//vWvCg8P18mTJzVx4kR16dJFiYmJ8vb2pp1LEQHFReLi4vTqq68WWmbv3r1q06aNnn/+eXNZixYt5O/vr//4j/8we1UkyWaz5dveMAyn5XnLGP8/QLagbfFvBbUbbXZ7oqOjzZ+bN2+u9u3bq379+lq0aJE5kLA47cu/we1xRdvezu+Wiqxfv37mz5GRkWrTpo3Cw8O1evVqxcTE3HQ72tn1uMTjIiNHjtTRo0cLnSIjIwvc9vov9uPHj0uSHA5HvpSdmpqqrKws8y+ogspc74bM+1cWrgkMDFTlypULbDfarHh8fX3VvHlzHTt2zLybp7D2dTgcyszMVGpq6k3LID9Xta3D4dD58+fz7f+XX36h/W8iJCRE4eHhOnbsmCTauTQRUFwkMDBQTZo0KXSqUqVKgdvu379f0rX/CJLUvn17HTp0SOfOnTPLrF+/Xt7e3mrdurVZZuvWrU5jV9avX6/Q0NB8l35wjZeXl1q3bq2EhASn5QkJCYqKinJTrcq2jIwMHT16VCEhIYqIiJDD4XBq38zMTG3ZssVs39atW8vT09OpzLlz53To0CH+DQrhqrZt37690tLStGfPHrPMN998o7S0NNr/Ji5cuKAzZ86Yv59p51LkrtG5FdXOnTuN+Ph4Y//+/caJEyeM5cuXG6GhoUafPn3MMtnZ2UZkZKTRtWtX49tvvzU2bNhg1K5d2xg5cqRZ5uLFi0ZwcLAxYMAA4+DBg8bKlSuN6tWrG2+88YY7TqvMWLZsmeHp6WnMnz/fOHLkiBEbG2v4+voaSUlJ7q5amTBmzBhj8+bNxokTJ4zdu3cbvXr1Mvz8/Mz2mzZtmmG3242VK1caBw8eNAYMGGCEhIQY6enp5j6GDRtm1K5d29iwYYPx7bffGl26dDFatmxpZGdnu+u0LOHSpUvG/v37jf379xuSzN8Tp06dMgzDdW3bs2dPo0WLFsauXbuMXbt2Gc2bNzd69epV6ufrLoW186VLl4wxY8YYO3fuNE6ePGls2rTJaN++vfGnP/2JdnYDAkopS0xMNNq2bWvY7XajSpUqRuPGjY1JkyYZV65ccSp36tQp48EHHzR8fHyMgIAAY+TIkU63FBuGYXz33XfG/fffb3h7exsOh8OIi4vjFuPb8M477xjh4eGGl5eXcffddxtbtmxxd5XKjH79+hkhISGGp6enERoaasTExBiHDx821+fm5hqTJk0yHA6H4e3tbXTo0ME4ePCg0z6uXr1qjBw50ggICDB8fHyMXr16GadPny7tU7GcTZs2GZLyTYMGDTIMw3Vte+HCBeNvf/ub4efnZ/j5+Rl/+9vfjNTU1FI6S/crrJ1///13o3v37katWrUMT09Po06dOsagQYPytSHtXDpshsGjRwEAgLUwBgUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFgOAQUAAFjO/wHim9be76qbVAAAAABJRU5ErkJggg==", 105 | "text/plain": [ 106 | "
" 107 | ] 108 | }, 109 | "metadata": {}, 110 | "output_type": "display_data" 111 | } 112 | ], 113 | "source": [ 114 | "# Randomly choose 2000 indices\n", 115 | "indices_to_remove = np.random.choice(gaussian_array.size, 2000, replace=False)\n", 116 | "\n", 117 | "# Get the samples corresponding to the chosen indices\n", 118 | "random_samples = gaussian_array[indices_to_remove]\n", 119 | "\n", 120 | "# Remove the selected samples from the original array\n", 121 | "remaining_arr = np.delete(gaussian_array, indices_to_remove)\n", 122 | "\n", 123 | "plt.hist(remaining_arr, bins=100, label='remaining array');\n", 124 | "plt.hist(random_samples, bins=100, label='random sampled');\n", 125 | "plt.legend()\n", 126 | "\n", 127 | "print(\"Remaining array shape:\", remaining_arr.shape)\n", 128 | "print(\"Random samples shape:\", random_samples.shape)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "8dd8f9f6-4673-4baf-9f80-ae14af502c48", 134 | "metadata": {}, 135 | "source": [ 136 | "#### Let's scale it!" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "id": "37c21018-07cd-4e74-9656-cd5f7919eec9", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "scaler = QuantileTransformer(n_quantiles = 74)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "id": "085f3dd5-704d-4a64-88c2-e495faf8e5a4", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "arr_transformed = scaler.fit_transform(remaining_arr.reshape(-1, 1))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "id": "e77ca417-db7b-4b86-856e-8ca0a5b28021", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "filename = \"quantile_scaler.json\"" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "id": "2a769941-850d-4379-a00a-4d6c7a60d4d1", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# save to json file\n", 177 | "save_scaler(scaler, filename)\n", 178 | "# create new StandardScaler from json file information.\n", 179 | "new_scaler = load_scaler(filename)\n", 180 | "# new_scaler is a StandardScaler object" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "f4db109a-22ee-473f-a055-bc30645da34c", 186 | "metadata": {}, 187 | "source": [ 188 | "Let's load those parameters, and fit on the subsampled dataset from above:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "id": "8cbd215e-57eb-403a-8759-b0b8d8a04462", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "random_transformed = new_scaler.fit_transform(random_samples.reshape(-1,1))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "857b3868-7082-49e1-9cbe-4da46fd59d32", 204 | "metadata": {}, 205 | "source": [ 206 | "We can also inverse transform to get close to the original array. The number of quantiles will have a large impact on this." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 10, 212 | "id": "0bd88fdf-3ad5-4f69-b564-26676be298ec", 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "array([[386.35688039],\n", 219 | " [586.64628448],\n", 220 | " [155.9777724 ],\n", 221 | " ...,\n", 222 | " [296.04124775],\n", 223 | " [ 50.26756938],\n", 224 | " [457.81957258]])" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "inverted_random = new_scaler.inverse_transform(random_transformed)\n", 234 | "inverted_random" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "id": "3da9bf26-2e80-4345-8f9f-74dd136d42b5", 240 | "metadata": {}, 241 | "source": [ 242 | "Let's see how far apart the medians are:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 11, 248 | "id": "0a398609-b80d-4cd3-acf5-dc3bd0bf5b1e", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "0.3066" 255 | ] 256 | }, 257 | "execution_count": 11, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "median_random = np.median(random_transformed)\n", 264 | "median_arr = np.median(arr_transformed)\n", 265 | "percentage_difference = np.round(np.abs(median_random - median_arr) / median_random * 100, 4)\n", 266 | "percentage_difference" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "b6799f65-f898-4357-aef2-e9283e9ecc73", 272 | "metadata": {}, 273 | "source": [ 274 | "## Pandas Example" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "0d328fee-9ad4-4245-8e33-6ef53c7213e4", 280 | "metadata": {}, 281 | "source": [ 282 | "This is using a [Group Scaler](https://bridgescaler.readthedocs.io/en/latest/group.html). " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 12, 288 | "id": "89a78409-d918-427e-8321-cc2e3b277dc9", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/html": [ 294 | "
\n", 295 | "\n", 308 | "\n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
abcde
count1000.0000001000.0000001000.0000001000.0000001000.000000
mean0.4931600.5082490.4823080.5003100.490955
std0.2874040.2909550.2833990.2860280.289448
min0.0002820.0004660.0002200.0017260.002557
25%0.2392520.2540530.2349610.2602310.238156
50%0.4897620.5283420.4793330.4984970.489239
75%0.7470750.7647940.7142800.7579060.743647
max0.9995660.9984530.9992720.9979660.999053
\n", 386 | "
" 387 | ], 388 | "text/plain": [ 389 | " a b c d e\n", 390 | "count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000\n", 391 | "mean 0.493160 0.508249 0.482308 0.500310 0.490955\n", 392 | "std 0.287404 0.290955 0.283399 0.286028 0.289448\n", 393 | "min 0.000282 0.000466 0.000220 0.001726 0.002557\n", 394 | "25% 0.239252 0.254053 0.234961 0.260231 0.238156\n", 395 | "50% 0.489762 0.528342 0.479333 0.498497 0.489239\n", 396 | "75% 0.747075 0.764794 0.714280 0.757906 0.743647\n", 397 | "max 0.999566 0.998453 0.999272 0.997966 0.999053" 398 | ] 399 | }, 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "x_rand = np.random.random(size=(1000, 5))\n", 407 | "data = pd.DataFrame(data=x_rand,\n", 408 | " columns=[\"a\", \"b\", \"c\", \"d\", \"e\"])\n", 409 | "data.describe()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 13, 415 | "id": "f2366229-634e-463f-b989-efd216efb58a", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "groups = [[\"a\", \"b\"], [\"c\", \"d\"], \"e\"]\n", 420 | "group_scaler = GroupStandardScaler()\n", 421 | "x_transformed = group_scaler.fit_transform(data, groups=groups)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "id": "5c565379-6bfb-4b41-b0fe-0f6cf3907aec", 427 | "metadata": {}, 428 | "source": [ 429 | "We can save out this file like the above:" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 14, 435 | "id": "6d04a3af-7fe1-42de-b9b6-793b82870bea", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "# save to json file\n", 440 | "save_scaler(group_scaler, 'group_scaler.json')" 441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [conda env:unidata-cpu]", 447 | "language": "python", 448 | "name": "conda-env-unidata-cpu-py" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.10.0" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 5 465 | } 466 | -------------------------------------------------------------------------------- /bridgescaler/distributed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.lib.recfunctions import structured_to_unstructured, unstructured_to_structured 3 | from copy import deepcopy 4 | from crick import TDigest as CTDigest 5 | from scipy.special import ndtr, ndtri 6 | import pandas as pd 7 | import xarray as xr 8 | from functools import partial 9 | from scipy.stats import logistic 10 | from numba import guvectorize, float32, float64, void 11 | CENTROID_DTYPE = np.dtype([('mean', np.float64), ('weight', np.float64)]) 12 | 13 | 14 | class DBaseScaler(object): 15 | """ 16 | Base distributed scaler class. Used only to store attributes and methods shared across all distributed 17 | scaler subclasses. 
18 | """ 19 | def __init__(self, channels_last=True): 20 | self.x_columns_ = None 21 | self.is_array_ = False 22 | self._fit = False 23 | self.channels_last = channels_last 24 | 25 | def is_fit(self): 26 | return self._fit 27 | 28 | @staticmethod 29 | def extract_x_columns(x, channels_last=True): 30 | """ 31 | Extract the variable names to be transformed from x depending on if x is a pandas DataFrame, an 32 | xarray DataArray, or a numpy array. All of these assume that the columns are in the last dimension. 33 | If x is an xarray DataArray, there should be a coorindate variable with the same name as the last dimension 34 | of the DataArray being transformed. 35 | 36 | Args: 37 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): array of values to be transformed. 38 | channels_last (bool): If True, then assume the variable or channel dimension is the last dimension of the 39 | array. If False, then assume the variable or channel dimension is second. 40 | 41 | Returns: 42 | xv (numpy.ndarray): Array of values to be transformed. 43 | is_array (bool): Whether or not x was a np.ndarray. 44 | """ 45 | is_array = False 46 | var_dim_num = -1 47 | if not channels_last: 48 | var_dim_num = 1 49 | if hasattr(x, "columns"): 50 | x_columns = x.columns.values 51 | elif hasattr(x, "coords"): 52 | var_dim = x.dims[var_dim_num] 53 | x_columns = x.coords[var_dim].values 54 | else: 55 | x_columns = np.arange(x.shape[var_dim_num]) 56 | is_array = True 57 | return x_columns, is_array 58 | 59 | @staticmethod 60 | def extract_array(x): 61 | if hasattr(x, "columns") or hasattr(x, "coords"): 62 | xv = x.values 63 | else: 64 | xv = x 65 | return xv 66 | 67 | def get_column_order(self, x_in_columns): 68 | """ 69 | Get the indices of the scaler columns that have the same name as the columns in the input x array. This 70 | enables users to pass a DataFrame or DataArray to transform or inverse_transform with fewer columns than 71 | the original scaler or columns in a different order and still have the input dataset be transformed properly. 72 | 73 | Args: 74 | x_in_columns (Union[list, numpy.ndarray]): list of input columns. 75 | 76 | Returns: 77 | x_in_col_indices (np.ndarray): indices of the input columns from x in the scaler in order. 78 | """ 79 | assert np.all(np.isin(x_in_columns, self.x_columns_)), "Some input columns not in scaler x_columns." 80 | x_in_col_indices = np.array([np.where(col == np.array(self.x_columns_))[0][0] for col in x_in_columns]) 81 | return x_in_col_indices 82 | 83 | @staticmethod 84 | def package_transformed_x(x_transformed, x): 85 | """ 86 | Repackaged a transformed numpy array into the same datatype as the original x, including 87 | all metadata. 
88 | 89 | Args: 90 | x_transformed (numpy.ndarray): array after being transformed or inverse transformed 91 | x (Union[pandas.DataFrame, xarray.DataArray, numpy.ndarray]): 92 | 93 | Returns: 94 | 95 | """ 96 | if hasattr(x, "columns"): 97 | x_packaged = pd.DataFrame(x_transformed, index=x.index, columns=x.columns) 98 | elif hasattr(x, "coords"): 99 | x_packaged = xr.DataArray(x_transformed, coords=x.coords, dims=x.dims, attrs=x.attrs, name=x.name) 100 | else: 101 | x_packaged = x_transformed 102 | return x_packaged 103 | 104 | def set_channel_dim(self, channels_last=None): 105 | if channels_last is None: 106 | channels_last = self.channels_last 107 | if channels_last: 108 | channel_dim = -1 109 | else: 110 | channel_dim = 1 111 | return channel_dim 112 | 113 | def process_x_for_transform(self, x, channels_last=None): 114 | if channels_last is None: 115 | channels_last = self.channels_last 116 | channel_dim = self.set_channel_dim(channels_last) 117 | assert self._fit, "Scaler has not been fit." 118 | x_in_cols, is_array = self.extract_x_columns(x, channels_last=channels_last) 119 | if is_array: 120 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "Number of input columns does not match scaler." 121 | x_col_order = np.arange(x.shape[channel_dim]) 122 | else: 123 | x_col_order = self.get_column_order(x_in_cols) 124 | xv = self.extract_array(x) 125 | x_transformed = np.zeros(xv.shape, dtype=xv.dtype) 126 | return xv, x_transformed, channels_last, channel_dim, x_col_order 127 | 128 | def fit(self, x, weight=None): 129 | pass 130 | 131 | def transform(self, x, channels_last=None): 132 | pass 133 | 134 | def fit_transform(self, x, channels_last=None, weight=None): 135 | self.fit(x, weight=weight) 136 | return self.transform(x, channels_last=channels_last) 137 | 138 | def inverse_transform(self, x, channels_last=None): 139 | pass 140 | 141 | def __add__(self, other): 142 | pass 143 | 144 | def subset_columns(self, sel_columns): 145 | pass 146 | 147 | def add_variables(self, other): 148 | pass 149 | 150 | 151 | class DStandardScaler(DBaseScaler): 152 | """ 153 | Distributed version of StandardScaler. You can calculate this map-reduce style by running it on individual 154 | data files, return the fitted objects, and then sum them together to represent the full dataset. Scaler 155 | supports numpy arrays, pandas dataframes, and xarray DataArrays and will return a transformed array in the 156 | same form as the original with column or coordinate names preserved. 
157 | 158 | """ 159 | def __init__(self, channels_last=True): 160 | self.mean_x_ = None 161 | self.n_ = 0 162 | self.var_x_ = None 163 | super().__init__(channels_last=channels_last) 164 | 165 | def fit(self, x, weight=None): 166 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 167 | xv = self.extract_array(x) 168 | channel_dim = self.set_channel_dim() 169 | if not self._fit: 170 | self.x_columns_ = x_columns 171 | self.is_array_ = is_array 172 | if len(xv.shape) > 2: 173 | if self.channels_last: 174 | self.n_ += np.prod(xv.shape[:-1]) 175 | else: 176 | self.n_ += xv.shape[0] * np.prod(xv.shape[2:]) 177 | else: 178 | self.n_ += xv.shape[0] 179 | self.mean_x_ = np.zeros(xv.shape[channel_dim], dtype=xv.dtype) 180 | self.var_x_ = np.zeros(xv.shape[channel_dim], dtype=xv.dtype) 181 | if self.channels_last: 182 | for i in range(xv.shape[channel_dim]): 183 | self.mean_x_[i] = np.nanmean(xv[..., i]) 184 | self.var_x_[i] = np.nanvar(xv[..., i]) 185 | else: 186 | for i in range(xv.shape[channel_dim]): 187 | self.mean_x_[i] = np.nanmean(xv[:, i]) 188 | self.var_x_[i] = np.nanvar(xv[:, i]) 189 | 190 | else: 191 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 192 | if is_array: 193 | if self.channels_last: 194 | x_col_order = np.arange(x.shape[-1]) 195 | else: 196 | x_col_order = np.arange(x.shape[1]) 197 | else: 198 | x_col_order = self.get_column_order(x_columns) 199 | # update derived from 200 | # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups 201 | if len(xv.shape) > 2: 202 | if self.channels_last: 203 | new_n = np.prod(xv.shape[:-1]) 204 | else: 205 | new_n = xv.shape[0] * np.prod(xv.shape[2:]) 206 | else: 207 | new_n = xv.shape[0] 208 | for i, o in enumerate(x_col_order): 209 | if self.channels_last: 210 | new_mean = np.nanmean(xv[..., i]) 211 | new_var = np.nanvar(xv[..., i]) 212 | else: 213 | new_mean = np.nanmean(xv[:, i]) 214 | new_var = np.nanvar(xv[:, i]) 215 | combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / (self.n_ + new_n) 216 | weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / (self.n_ + new_n) 217 | var_correction = (self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2) / ( 218 | (self.n_ + new_n) ** 2) 219 | self.mean_x_[o] = combined_mean 220 | self.var_x_[o] = weighted_var + var_correction 221 | self.n_ += new_n 222 | self._fit = True 223 | 224 | def transform(self, x, channels_last=None): 225 | """ 226 | Transform the input data from its original form to standard scaled form. If your input data has a 227 | different dimension order than the data used to fit the scaler, use the channels_last keyword argument 228 | to specify whether the new data are `channels_last` (True) or `channels_first` (False). 229 | 230 | Args: 231 | x: Input data. 232 | channels_last: Override the default channels_last parameter of the scaler. 233 | 234 | Returns: 235 | x_transformed: Transformed data in the same shape and type as x. 
236 | """ 237 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 238 | x_mean, x_var = self.get_scales() 239 | if channels_last: 240 | for i, o in enumerate(x_col_order): 241 | x_transformed[..., i] = (xv[..., i] - x_mean[o]) / np.sqrt(x_var[o]) 242 | else: 243 | for i, o in enumerate(x_col_order): 244 | x_transformed[:, i] = (xv[:, i] - x_mean[o]) / np.sqrt(x_var[o]) 245 | x_transformed_final = self.package_transformed_x(x_transformed, x) 246 | return x_transformed_final 247 | 248 | def inverse_transform(self, x, channels_last=None): 249 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 250 | x_mean, x_var = self.get_scales() 251 | if channels_last: 252 | for i, o in enumerate(x_col_order): 253 | x_transformed[..., i] = xv[..., i] * np.sqrt(x_var[o]) + x_mean[o] 254 | else: 255 | for i, o in enumerate(x_col_order): 256 | x_transformed[:, i] = xv[:, i] * np.sqrt(x_var[o]) + x_mean[o] 257 | x_transformed = self.package_transformed_x(x_transformed, x) 258 | return x_transformed 259 | 260 | def get_scales(self): 261 | return self.mean_x_, self.var_x_ 262 | 263 | def __add__(self, other): 264 | assert type(other) is DStandardScaler, "Input is not DStandardScaler" 265 | assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match." 266 | current = deepcopy(self) 267 | current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / (self.n_ + other.n_) 268 | combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / (self.n_ + other.n_) 269 | combined_var_corr = (self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2) / ( 270 | (self.n_ + other.n_) ** 2) 271 | current.var_x_ = combined_var + combined_var_corr 272 | current.n_ = self.n_ + other.n_ 273 | return current 274 | 275 | 276 | class DMinMaxScaler(DBaseScaler): 277 | """ 278 | Distributed MinMaxScaler enables calculation of min and max of variables in datasets in parallel then combining 279 | the mins and maxes as a reduction step. Scaler 280 | supports numpy arrays, pandas dataframes, and xarray DataArrays and will return a transformed array in the 281 | same form as the original with column or coordinate names preserved. 
282 | 283 | """ 284 | def __init__(self, channels_last=True): 285 | self.max_x_ = None 286 | self.min_x_ = None 287 | super().__init__(channels_last=channels_last) 288 | 289 | def fit(self, x, weight=None): 290 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 291 | xv = self.extract_array(x) 292 | channel_dim = self.set_channel_dim() 293 | if not self._fit: 294 | self.x_columns_ = x_columns 295 | self.is_array_ = is_array 296 | self.max_x_ = np.zeros(xv.shape[channel_dim]) 297 | self.min_x_ = np.zeros(xv.shape[channel_dim]) 298 | if self.channels_last: 299 | for i in range(xv.shape[channel_dim]): 300 | self.max_x_[i] = np.nanmax(xv[..., i]) 301 | self.min_x_[i] = np.nanmin(xv[..., i]) 302 | else: 303 | for i in range(xv.shape[channel_dim]): 304 | self.max_x_[i] = np.nanmax(xv[:, i]) 305 | self.min_x_[i] = np.nanmin(xv[:, i]) 306 | else: 307 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 308 | if is_array: 309 | if self.channels_last: 310 | x_col_order = np.arange(x.shape[-1]) 311 | else: 312 | x_col_order = np.arange(x.shape[1]) 313 | else: 314 | x_col_order = self.get_column_order(x_columns) 315 | if self.channels_last: 316 | for i, o in enumerate(x_col_order): 317 | self.max_x_[o] = np.maximum(self.max_x_[o], np.nanmax(xv[..., i])) 318 | self.min_x_[o] = np.minimum(self.min_x_[o], np.nanmin(xv[..., i])) 319 | else: 320 | for i, o in enumerate(x_col_order): 321 | self.max_x_[o] = np.maximum(self.max_x_[o], np.nanmax(xv[:, i])) 322 | self.min_x_[o] = np.minimum(self.min_x_[o], np.nanmin(xv[:, i])) 323 | self._fit = True 324 | 325 | def transform(self, x, channels_last=None): 326 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 327 | if channels_last: 328 | for i, o in enumerate(x_col_order): 329 | x_transformed[..., i] = (xv[..., i] - self.min_x_[o]) / ( 330 | self.max_x_[o] - self.min_x_[o]) 331 | else: 332 | for i, o in enumerate(x_col_order): 333 | x_transformed[:, i] = (xv[:, i] - self.min_x_[o]) / ( 334 | self.max_x_[o] - self.min_x_[o]) 335 | x_transformed = self.package_transformed_x(x_transformed, x) 336 | return x_transformed 337 | 338 | def inverse_transform(self, x, channels_last=None): 339 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 340 | if channels_last: 341 | for i, o in enumerate(x_col_order): 342 | x_transformed[..., i] = xv[..., i] * (self.max_x_[o] - 343 | self.min_x_[o]) + self.min_x_[o] 344 | else: 345 | for i, o in enumerate(x_col_order): 346 | x_transformed[:, i] = xv[:, i] * (self.max_x_[o] - 347 | self.min_x_[o]) + self.min_x_[o] 348 | x_transformed = self.package_transformed_x(x_transformed, x) 349 | return x_transformed 350 | 351 | def get_scales(self): 352 | return self.min_x_, self.max_x_ 353 | 354 | def __add__(self, other): 355 | assert type(other) is DMinMaxScaler, "Input is not DMinMaxScaler" 356 | assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match."
357 | current = deepcopy(self) 358 | current.max_x_ = np.maximum(self.max_x_, other.max_x_) 359 | current.min_x_ = np.minimum(self.min_x_, other.min_x_) 360 | return current 361 | 362 | 363 | def fit_variable(var_index, xv_shared=None, compression=None, channels_last=None): 364 | xv = xv_shared 365 | td_obj = CTDigest(compression=compression) 366 | if channels_last: 367 | td_obj.update(xv[..., var_index].ravel()) 368 | else: 369 | td_obj.update(xv[:, var_index].ravel()) 370 | return td_obj 371 | 372 | 373 | def transform_variable(td_obj, xv, 374 | min_val=0.000001, max_val=0.9999999, distribution="normal"): 375 | td_centroids = td_obj.centroids() 376 | x_transformed = np.zeros_like(xv) 377 | tdigest_cdf(xv, td_centroids["mean"], td_centroids["weight"], 378 | td_obj.min(), td_obj.max(), x_transformed) 379 | x_transformed = np.minimum(x_transformed, max_val) 380 | x_transformed = np.maximum(x_transformed, min_val) 381 | if distribution == "normal": 382 | x_transformed = ndtri(x_transformed) 383 | elif distribution == "logistic": 384 | x_transformed = logistic.ppf(x_transformed) 385 | return x_transformed 386 | 387 | 388 | def inv_transform_variable(td_obj, xv, 389 | distribution="normal"): 390 | td_centroids = td_obj.centroids() 391 | x_transformed = np.zeros_like(xv) 392 | if distribution == "normal": 393 | x_intermediate = ndtr(xv) 394 | elif distribution == "logistic": 395 | x_intermediate = logistic.cdf(xv) 396 | else: 397 | x_intermediate = xv 398 | tdigest_quantile(x_intermediate, td_centroids["mean"], td_centroids["weight"], 399 | td_obj.min(), td_obj.max(), x_transformed) 400 | return x_transformed 401 | 402 | 403 | @guvectorize([void(float64[:], float64[:], float64[:], float64, float64, float64[:]), 404 | void(float32[:], float64[:], float64[:], float64, float64, float32[:])], "(m),(n),(n),(),()->(m)") 405 | def tdigest_cdf(xv, cent_mean, cent_weight, t_min, t_max, out): 406 | cent_merged_weight = np.zeros_like(cent_weight) 407 | cumulative_weight = 0 408 | for i in range(cent_weight.size): 409 | cent_merged_weight[i] = cumulative_weight + cent_weight[i] / 2.0 410 | cumulative_weight += cent_weight[i] 411 | total_weight = cent_weight.sum() 412 | for i, x in enumerate(xv): 413 | if cent_mean.size == 0: 414 | out[i] = np.nan 415 | continue 416 | # Single centroid 417 | if cent_mean.size == 1: 418 | if x < t_min: 419 | out[i] = 0.0 420 | elif x > t_max: 421 | out[i] = 1.0 422 | elif t_max - t_min < np.finfo(np.float64).eps: 423 | out[i] = 0.5 424 | else: 425 | out[i] = (x - t_min) / (t_max - t_min) 426 | continue 427 | # Equality checks only apply if > 1 centroid 428 | if x >= t_max: 429 | out[i] = 1.0 430 | continue 431 | elif x <= t_min: 432 | out[i] = 0.0 433 | continue 434 | 435 | # i_l = bisect_left_mean(T->merge_centroids, x, 0, T->ncentroids); 436 | i_l = np.searchsorted(cent_mean, x, side="left") 437 | if x < cent_mean[0]: 438 | # min < x < first centroid 439 | x0 = t_min 440 | x1 = cent_mean[0] 441 | dw = cent_merged_weight[0] / 2.0 442 | out[i] = dw * (x - x0) / (x1 - x0) / total_weight 443 | elif i_l == cent_mean.size: 444 | # last centroid < x < max 445 | x0 = cent_mean[i_l - 1] 446 | x1 = t_max 447 | dw = cent_weight[i_l - 1] / 2.0 448 | out[i] = 1.0 - dw * (x1 - x) / (x1 - x0) / total_weight 449 | elif cent_mean[i_l] == x: 450 | # x is equal to one or more centroids 451 | i_r = np.searchsorted(cent_mean, x, side="right") 452 | out[i] = cent_merged_weight[i_r] / total_weight 453 | else: 454 | assert cent_mean[i_l] > x 455 | x0 = cent_mean[i_l - 1] 456 | x1 = cent_mean[i_l] 
457 | dw = 0.5 * (cent_weight[i_l - 1] + cent_weight[i_l]) 458 | out[i] = (cent_merged_weight[i_l - 1] + dw * (x - x0) / (x1 - x0)) / total_weight 459 | 460 | 461 | @guvectorize([void(float64[:], float64[:], float64[:], float64, float64, float64[:]), 462 | void(float32[:], float64[:], float64[:], float64, float64, float32[:])], "(m),(n),(n),(),()->(m)") 463 | def tdigest_quantile(qv, cent_mean, cent_weight, t_min, t_max, out): 464 | cent_merged_weight = np.zeros_like(cent_weight) 465 | cumulative_weight = 0 466 | for i in range(cent_weight.size): 467 | cent_merged_weight[i] = cumulative_weight + cent_weight[i] / 2.0 468 | cumulative_weight += cent_weight[i] 469 | total_weight = cent_weight.sum() 470 | for i, q in enumerate(qv): 471 | if total_weight == 0: 472 | out[i] = np.nan 473 | continue 474 | if q <= 0: 475 | out[i] = t_min 476 | continue 477 | if q >= 1: 478 | out[i] = t_max 479 | continue 480 | if cent_mean.size == 1: 481 | out[i] = cent_mean[0] 482 | continue 483 | 484 | index = q * total_weight 485 | b = np.searchsorted(cent_merged_weight, index, side="left") 486 | if b == 0: 487 | x0 = 0 488 | y0 = t_min 489 | else: 490 | x0 = cent_merged_weight[b - 1] 491 | y0 = cent_mean[b - 1] 492 | 493 | if b == cent_mean.size: 494 | x1 = total_weight 495 | y1 = t_max 496 | else: 497 | x1 = cent_merged_weight[b] 498 | y1 = cent_mean[b] 499 | out[i] = y0 + (index - x0) * (y1 - y0) / (x1 - x0) 500 | 501 | 502 | class DQuantileScaler(DBaseScaler): 503 | """ 504 | Distributed Quantile Scaler that uses the crick TDigest Cython library to compute quantiles across multiple 505 | datasets in parallel. The library can perform fitting, transforms, and inverse transforms across variables 506 | in parallel using the multiprocessing library. Multidimensional arrays are stored in shared memory across 507 | processes to minimize inter-process communication. 508 | 509 | DQuantileScaler supports numpy arrays, pandas DataFrames, and xarray DataArrays and will return a transformed array in the same form as the original with column or coordinate names preserved. 510 | 511 | Attributes: 512 | compression: Recommended number of centroids to use. 513 | distribution: "uniform", "normal", or "logistic". 514 | min_val: Minimum value for quantile to prevent -inf results when distribution is normal or logistic. 515 | max_val: Maximum value for quantile to prevent inf results when distribution is normal or logistic. 516 | channels_last: Whether to assume the last dim or second dim are the channel/variable dimension.
517 | """ 518 | def __init__(self, compression=250, distribution="uniform", min_val=0.0000001, max_val=0.9999999, channels_last=True): 519 | self.compression = compression 520 | self.distribution = distribution 521 | self.min_val = min_val 522 | self.max_val = max_val 523 | self.centroids_ = None 524 | self.size_ = None 525 | self.min_ = None 526 | self.max_ = None 527 | 528 | super().__init__(channels_last=channels_last) 529 | 530 | def td_objs_to_attributes(self, td_objs): 531 | self.centroids_ = [structured_to_unstructured(td_obj.centroids()) for td_obj in td_objs] 532 | self.size_ = np.array([td_obj.size() for td_obj in td_objs]) 533 | self.min_ = np.array([td_obj.min() for td_obj in td_objs]) 534 | self.max_ = np.array([td_obj.max() for td_obj in td_objs]) 535 | return 536 | 537 | def attributes_to_td_objs(self): 538 | td_objs = [] 539 | if self.is_fit(): 540 | for i in range(self.max_.size): 541 | td_objs.append(CTDigest(self.compression)) 542 | td_objs[-1].__setstate__((unstructured_to_structured(self.centroids_[i], CENTROID_DTYPE), 543 | self.size_[i], 544 | self.min_[i], 545 | self.max_[i])) 546 | return td_objs 547 | 548 | def fit(self, x, weight=None): 549 | x_columns, is_array = self.extract_x_columns(x, channels_last=self.channels_last) 550 | xv = self.extract_array(x) 551 | channel_dim = self.set_channel_dim() 552 | if not self._fit: 553 | self.x_columns_ = x_columns 554 | self.is_array_ = is_array 555 | fit_var_func = partial(fit_variable, 556 | xv_shared=xv, 557 | compression=self.compression, 558 | channels_last=self.channels_last) 559 | td_objs = [fit_var_func(x) for x in np.arange(xv.shape[channel_dim])] 560 | self.td_objs_to_attributes(td_objs) 561 | else: 562 | assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns" 563 | if is_array: 564 | if self.channels_last: 565 | x_col_order = np.arange(x.shape[-1]) 566 | else: 567 | x_col_order = np.arange(x.shape[1]) 568 | else: 569 | x_col_order = self.get_column_order(x_columns) 570 | td_objs = self.attributes_to_td_objs() 571 | fit_var_func = partial(fit_variable, 572 | xv_shared=xv, 573 | compression=self.compression, 574 | channels_last=self.channels_last) 575 | new_td_objs = [fit_var_func(x) for x in np.arange(xv.shape[channel_dim])] 576 | for i, o in enumerate(x_col_order): 577 | td_objs[o].merge(new_td_objs[i]) 578 | self.td_objs_to_attributes(td_objs) 579 | self._fit = True 580 | return 581 | 582 | def transform(self, x, channels_last=None, pool=None): 583 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 584 | td_objs = self.attributes_to_td_objs() 585 | td_i_objs = [(i, td_objs[o]) for i, o in enumerate(x_col_order)] 586 | 587 | trans_var_func = partial(transform_variable, 588 | min_val=self.min_val, max_val=self.max_val, 589 | distribution=self.distribution) 590 | if channels_last: 591 | if pool is not None: 592 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 593 | xt_shape = x_transformed[..., 0].shape 594 | outputs = [] 595 | for td_obj in td_i_objs: 596 | for s, s_ind in enumerate(split_indices[1:]): 597 | outputs.append(pool.apply_async(trans_var_func, (td_obj[1], 598 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 599 | x_transformed[..., td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 600 | del outputs[:] 601 | else: 602 | for td_obj in td_i_objs: 603 | x_transformed[..., td_obj[0]] = trans_var_func(td_obj[1], xv[..., 
td_obj[0]]) 604 | else: 605 | if pool is not None: 606 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 607 | xt_shape = x_transformed[:, 0].shape 608 | outputs = [] 609 | for td_obj in td_i_objs: 610 | for s, s_ind in enumerate(split_indices[1:]): 611 | outputs.append(pool.apply_async(trans_var_func, (td_obj[1], 612 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 613 | x_transformed[:, td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 614 | del outputs[:] 615 | else: 616 | for td_obj in td_i_objs: 617 | x_transformed[:, td_obj[0]] = trans_var_func(td_obj[1], xv[:, td_obj[0]]) 618 | x_transformed_final = self.package_transformed_x(x_transformed, x) 619 | return x_transformed_final 620 | 621 | def fit_transform(self, x, channels_last=None, weight=None, pool=None): 622 | self.fit(x, weight=weight) 623 | return self.transform(x, channels_last=channels_last, pool=pool) 624 | 625 | def inverse_transform(self, x, channels_last=None, pool=None): 626 | xv, x_transformed, channels_last, channel_dim, x_col_order = self.process_x_for_transform(x, channels_last) 627 | td_objs = self.attributes_to_td_objs() 628 | td_i_objs = [(i, td_objs[o]) for i, o in enumerate(x_col_order)] 629 | inv_trans_var_func = partial(inv_transform_variable, 630 | distribution=self.distribution) 631 | if channels_last: 632 | if pool is not None: 633 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 634 | xt_shape = x_transformed[..., 0].shape 635 | outputs = [] 636 | for td_obj in td_i_objs: 637 | for s, s_ind in enumerate(split_indices[1:]): 638 | outputs.append(pool.apply_async(inv_trans_var_func, (td_obj[1], 639 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 640 | x_transformed[..., td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 641 | del outputs[:] 642 | else: 643 | for td_obj in td_i_objs: 644 | x_transformed[..., td_obj[0]] = inv_trans_var_func(td_obj[1], xv[..., td_obj[0]]) 645 | else: 646 | if pool is not None: 647 | split_indices = np.round(np.linspace(0, xv[..., 0].size, pool._processes)).astype(int) 648 | xt_shape = x_transformed[:, 0].shape 649 | outputs = [] 650 | for td_obj in td_i_objs: 651 | for s, s_ind in enumerate(split_indices[1:]): 652 | outputs.append(pool.apply_async(inv_trans_var_func, (td_obj[1], 653 | xv[..., td_obj[0]].ravel()[split_indices[s]:s_ind]))) 654 | x_transformed[:, td_obj[0]] = np.reshape(np.concatenate([o.get() for o in outputs]), xt_shape) 655 | del outputs[:] 656 | else: 657 | for td_obj in td_i_objs: 658 | x_transformed[:, td_obj[0]] = inv_trans_var_func(td_obj[1], xv[:, td_obj[0]]) 659 | x_transformed_final = self.package_transformed_x(x_transformed, x) 660 | return x_transformed_final 661 | 662 | def __add__(self, other): 663 | current = deepcopy(self) 664 | td_objs = current.attributes_to_td_objs() 665 | other_td_objs = other.attributes_to_td_objs() 666 | assert type(other) is DQuantileScaler, "Adding mismatched scaler types." 667 | assert current.is_fit() and other.is_fit(), "At least one scaler is not fit." 668 | x_col_order = current.get_column_order(other.x_columns_) 669 | assert x_col_order.size > 0, "No matching columns in other DQuantileScaler" 670 | for i, o in enumerate(x_col_order): 671 | td_objs[o].merge(other_td_objs[i]) 672 | current.td_objs_to_attributes(td_objs) 673 | return current 674 | 675 | --------------------------------------------------------------------------------
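
The DStandardScaler docstring above describes a map-reduce style workflow: fit a separate scaler on each chunk of data, then add the fitted scalers together to represent the full dataset. The short sketch below illustrates that workflow; it is only a sketch, and the chunk count, column names, and random data are assumptions made for the example rather than anything taken from the repository.

import numpy as np
import pandas as pd
from functools import reduce
from bridgescaler import DStandardScaler

# Synthetic stand-ins for separate data files (illustrative shapes and names only).
columns = ["a", "b", "c"]
chunks = [pd.DataFrame(np.random.normal(size=(500, 3)), columns=columns) for _ in range(4)]

# Map step: fit one scaler per chunk; each fit could run in its own job or process.
partial_scalers = []
for chunk in chunks:
    scaler = DStandardScaler()
    scaler.fit(chunk)
    partial_scalers.append(scaler)

# Reduce step: adding scalers combines their counts, means, and variances.
full_scaler = reduce(lambda a, b: a + b, partial_scalers)

# Transform new data with the combined statistics; the output keeps the DataFrame columns.
new_data = pd.DataFrame(np.random.normal(size=(100, 3)), columns=columns)
scaled = full_scaler.transform(new_data)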
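
DQuantileScaler follows the same fit, add, and transform pattern, and its transform and inverse_transform methods additionally accept an optional multiprocessing pool to split the work across processes, as seen in the source above. The sketch below is a minimal, assumed usage example: the gamma-distributed data, array shape, and pool size are invented for illustration.

import multiprocessing as mp
import numpy as np
from bridgescaler import DQuantileScaler

if __name__ == "__main__":
    # Skewed synthetic data (shape and distribution chosen only for the example).
    x = np.random.gamma(shape=2.0, scale=10.0, size=(2000, 4))

    dqs = DQuantileScaler(compression=250, distribution="normal")
    dqs.fit(x)

    with mp.Pool(4) as pool:
        # Quantiles come from the per-variable t-digests and are mapped through the normal PPF.
        x_norm = dqs.transform(x, pool=pool)
        # The inverse transform approximately recovers the original values.
        x_back = dqs.inverse_transform(x_norm, pool=pool)
        print("max reconstruction error:", np.abs(x_back - x).max())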