├── .gitignore ├── tests ├── __init__.py └── dataschema │ ├── __init__.py │ ├── test_calculator.py │ ├── test_anonymize.py │ └── test_schemagen.py ├── requirements.txt ├── docs ├── reference │ └── dataschema.md └── index.md ├── MANIFEST.in ├── mercury └── dataschema │ ├── __init__.py │ ├── create_tutorials.py │ ├── calculator.py │ ├── tutorials │ └── hello_dataschema.ipynb │ ├── anonymize.py │ ├── feature.py │ └── schemagen.py ├── CHANGELOG.md ├── .bumpversion.cfg ├── .github └── workflows │ ├── pypi_upload.yml │ └── test.yml ├── mkdocs.yml ├── pyproject.toml ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/dataschema/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography 2 | numpy 3 | -------------------------------------------------------------------------------- /docs/reference/dataschema.md: -------------------------------------------------------------------------------- 1 | # Data Schema 2 | 3 | ::: mercury.dataschema -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft mercury/dataschema/ 2 | 3 | recursive-include mercury/dataschema/tutorials * 4 | -------------------------------------------------------------------------------- /mercury/dataschema/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '1.1.2' 2 | 3 | from .schemagen import DataSchema 4 | from .create_tutorials import create_tutorials 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Latest version 1.1.2 2 | 3 | | Release | Date | Main feature(s) | 4 | | -------- | ---- | --------------- | 5 | | 1.1.2 | 2025/02/11 | Implements create_tutorials(), adds support for python 3.13, improves documentation. | 6 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.1.2 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(-(?P\w+)\.(?P\d+))? 4 | serialize = 5 | {major}.{minor}.{patch}-{release}.{build} 6 | {major}.{minor}.{patch} 7 | commit = True 8 | tag = True 9 | 10 | [bumpversion:file:mercury/dataschema/__init__.py] 11 | 12 | [bumpversion:file:README.md] 13 | 14 | [bumpversion:file:docs/index.md] 15 | 16 | [bumpversion:file:pyproject.toml] 17 | -------------------------------------------------------------------------------- /.github/workflows/pypi_upload.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install wheel 20 | pip install build 21 | - name: Build package 22 | run: | 23 | python -m build 24 | - name: Publish package 25 | uses: pypa/gh-action-pypi-publish@release/v1 26 | with: 27 | user: ${{ 
secrets.pypi_user }} 28 | password: ${{ secrets.pypi_password }} 29 | packages_dir: ./dist/ 30 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: mercury-dataschema 2 | repo_url: https://github.com/BBVA/mercury-dataschema/ 3 | repo_name: mercury-dataschema 4 | theme: 5 | name: material 6 | features: 7 | - tabs 8 | - navigation.indexes 9 | icon: 10 | logo: material/book-open-page-variant 11 | repo: fontawesome/brands/github 12 | site_dir: site 13 | nav: 14 | - Home: index.md 15 | #- Contributing: 16 | # - How to contribute: CONTRIBUTING.md 17 | - Api: 18 | - dataschema: reference/dataschema.md 19 | markdown_extensions: 20 | - codehilite 21 | - admonition 22 | - pymdownx.superfences 23 | - pymdownx.arithmatex: 24 | generic: true 25 | extra_css: 26 | - stylesheets/extra.css 27 | extra_javascript: 28 | - javascripts/config.js 29 | - https://polyfill.io/v3/polyfill.min.js?features=es6 30 | - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js 31 | plugins: 32 | - mkdocstrings: 33 | handlers: 34 | python: 35 | options: 36 | show_root_heading: true 37 | show_submodules: true 38 | merge_init_into_class: true 39 | docstring_style: google 40 | dev_addr: 0.0.0.0:8080 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mercury-dataschema" 7 | license = {file = "LICENSE.txt"} 8 | version = "1.1.2" 9 | authors = [ 10 | { name="Mercury Team", email="mercury.group@bbva.com" }, 11 | ] 12 | description = "Mercury's DataSchema package allows the automatic recognition and validation of feature types." 
13 | readme = "README.md" 14 | requires-python = ">=3.7" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dependencies = [ 21 | 'numpy', 22 | 'cryptography' 23 | ] 24 | 25 | [project.optional-dependencies] 26 | dev = [ 27 | 'seaborn', 28 | 'pytest', 29 | 'flake8', 30 | ] 31 | doc = [ 32 | 'mkdocs', 33 | 'mkdocstrings[python]', 34 | 'mkdocs-material', 35 | 'mkdocs-minify-plugin==0.5.0', 36 | 'mkdocs-exclude', 37 | 'nbconvert', 38 | ] 39 | 40 | [project.urls] 41 | "Homepage" = "https://github.com/BBVA/mercury-dataschema" 42 | "Bug Tracker" = "https://github.com/BBVA/mercury-dataschema/issues" 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Mercury-Dataschema 2 | 3 | on: 4 | push: 5 | branches: [ "master", "develop" ] 6 | pull_request: 7 | branches: [ "master", "develop" ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install package 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install flake8 pytest build 27 | python -m pip install -e .[dev] 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . 
--count --select=E9,F63,F7,F82 --show-source --max-line-length=127 --statistics 32 | - name: Test with pytest 33 | run: | 34 | pytest 35 | - name: Test build 36 | run: | 37 | python -m build -------------------------------------------------------------------------------- /mercury/dataschema/create_tutorials.py: -------------------------------------------------------------------------------- 1 | import os, pkg_resources, shutil 2 | 3 | 4 | def create_tutorials(destination, silent = False): 5 | """ 6 | Copies mercury.dataschema tutorial notebooks to `destination`. A folder will be created inside 7 | destination, named 'dataschema_tutorials'. The folder `destination` must exist. 8 | 9 | Args: 10 | destination (str): The destination directory 11 | silent (bool): If True, suppresses output on success. 12 | 13 | Raises: 14 | ValueError: If `destination` is equal to source path. 15 | 16 | Examples: 17 | >>> # copy tutorials to /tmp/dataschema_tutorials 18 | >>> from mercury.dataschema import create_tutorials 19 | >>> create_tutorials('/tmp') 20 | 21 | """ 22 | src = pkg_resources.resource_filename(__package__, 'tutorials') 23 | dst = os.path.abspath(destination) 24 | 25 | assert src != dst, 'Destination (%s) cannot be the same as source.' % src 26 | 27 | assert os.path.isdir(dst), 'Destination (%s) must be a directory.' 
% dst 28 | 29 | dst = os.path.join(dst, 'dataschema_tutorials') 30 | 31 | assert not os.path.exists(dst), 'Destination (%s) already exists' % dst 32 | 33 | shutil.copytree(src, dst) 34 | 35 | if not silent: 36 | print('Tutorials copied to: %s' % dst) 37 | -------------------------------------------------------------------------------- /tests/dataschema/test_calculator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from mercury.dataschema.calculator import StatCalculatorFactory, PandasStatCalculator 6 | from mercury.dataschema.feature import Feature 7 | 8 | 9 | @pytest.fixture(scope='module') 10 | def pandas_df(): 11 | data = [['tom', 10], ['nick', 15], ['juli', 14]] 12 | return pd.DataFrame(data, columns=['Name', 'Age']) 13 | 14 | 15 | def test_calculator_factory(pandas_df): 16 | assert isinstance(StatCalculatorFactory.build_calculator(pandas_df), PandasStatCalculator) 17 | 18 | 19 | def test_calculator(pandas_df): 20 | calculator = StatCalculatorFactory.build_calculator(pandas_df) 21 | 22 | feature = Feature() 23 | 24 | calculator.min(pandas_df['Age'], feature) 25 | calculator.max(pandas_df['Age'], feature) 26 | calculator.std(pandas_df['Age'], feature) 27 | calculator.mean(pandas_df['Age'], feature) 28 | 29 | assert feature.stats['min'] == 10 30 | assert feature.stats['max'] == 15 31 | assert feature.stats['mean'] == 13 32 | 33 | 34 | def test_set_config(pandas_df): 35 | calculator = StatCalculatorFactory.build_calculator(pandas_df) 36 | with pytest.raises(ValueError): 37 | calculator.set_config(**{'nonexistingattr': 10}) 38 | 39 | # assert it assigns the property well 40 | calculator.set_config(**{'distribution_bins_method': 10}) 41 | assert calculator.distribution_bins_method == 10 42 | 43 | # Assert does nothing with None 44 | calculator.set_config() 45 | -------------------------------------------------------------------------------- 
/tests/dataschema/test_anonymize.py: -------------------------------------------------------------------------------- 1 | import os, pytest 2 | 3 | from mercury.dataschema.anonymize import Anonymize 4 | 5 | 6 | def test_anonymize(): 7 | os.environ['MERCURY_ANONYMIZE_DATASCHEMA_KEY'] = 'Mickey Mouse' 8 | 9 | am1 = Anonymize() 10 | am2 = Anonymize(6*6) 11 | 12 | assert am1.hash_key == am2.hash_key 13 | 14 | del os.environ['MERCURY_ANONYMIZE_DATASCHEMA_KEY'] 15 | 16 | an1 = Anonymize(0) 17 | an2 = Anonymize(20*6) 18 | an3 = Anonymize(0, True) 19 | 20 | assert an1.hash_key == an2.hash_key and an1.hash_key == an3.hash_key and an1.hash_key != am1.hash_key 21 | 22 | pl = ['a', 'little', 'bit', 'of text.', 'a', 'ittle', 'bit', 'more.', 'A'] 23 | 24 | cp_am1 = am1.anonymize_list(pl) 25 | 26 | assert [len(s) for s in cp_am1] == [16 for _ in range(9)] 27 | assert cp_am1[0] == cp_am1[4] and cp_am1[1] != cp_am1[5] and cp_am1[2] == cp_am1[6] and cp_am1[0] != cp_am1[8] 28 | 29 | cp_am2 = am2.anonymize_list(pl) 30 | 31 | assert [len(s) for s in cp_am2] == [6 for _ in range(9)] 32 | assert cp_am2[0] == cp_am2[4] and cp_am2[1] != cp_am2[5] and cp_am2[2] == cp_am2[6] and cp_am2[0] != cp_am2[8] 33 | assert [s.startswith(t) for s, t in zip(cp_am1, cp_am2)] == [True for _ in range(9)] 34 | 35 | cp_an1 = an1.anonymize_list(pl) 36 | 37 | assert [len(s) > 16 for s in cp_an1] == [True for _ in range(9)] 38 | assert cp_an1[0] == cp_an1[4] and cp_an1[1] != cp_an1[5] and cp_an1[2] == cp_an1[6] and cp_an1[0] != cp_an1[8] 39 | 40 | cp_an2 = an2.anonymize_list(pl) 41 | 42 | assert [len(s) for s in cp_an2] == [20 for _ in range(9)] 43 | assert cp_an2[0] == cp_an2[4] and cp_an2[1] != cp_an2[5] and cp_an2[2] == cp_an2[6] and cp_an2[0] != cp_an2[8] 44 | 45 | cp_an3 = an3.anonymize_list(pl) 46 | 47 | assert [len(s) > 16 for s in cp_an3] == [True for _ in range(9)] 48 | assert cp_an3[0] != cp_an3[4] and cp_an3[1] != cp_an3[5] and cp_an3[2] != cp_an3[6] and cp_an3[0] != cp_an3[8] 49 | assert [len(t) - 
len(s) for s, t in zip(cp_an1, cp_an3)] == [16 for _ in range(9)] 50 | 51 | with pytest.raises(ValueError): 52 | pl = am1.deanonymize_list(cp_am1) 53 | 54 | with pytest.raises(ValueError): 55 | pl = am2.deanonymize_list(cp_am2) 56 | 57 | with pytest.raises(ValueError): 58 | pl = an2.deanonymize_list(cp_an2) 59 | 60 | pl1 = an1.deanonymize_list(cp_an1) 61 | 62 | assert pl1 == pl 63 | 64 | pl3 = an3.deanonymize_list(cp_an3) 65 | 66 | assert pl3 == pl 67 | 68 | bm1 = Anonymize() 69 | 70 | assert am1.hash_key != bm1.hash_key 71 | 72 | cp_bm1 = bm1.anonymize_list(pl) 73 | 74 | assert cp_am1 != cp_bm1 75 | 76 | bm1.set_key('Mickey Mouse') 77 | 78 | assert am1.hash_key == bm1.hash_key 79 | 80 | cp_bm1 = bm1.anonymize_list(pl) 81 | 82 | assert cp_am1 == cp_bm1 83 | -------------------------------------------------------------------------------- /mercury/dataschema/calculator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Union 3 | 4 | from .feature import BinaryFeature, CategoricalFeature 5 | 6 | 7 | class FeatureCalculator(): 8 | """ This is a base class with the operation definitions. Several classes must 9 | extend this, implementing its operations for each one of the supported frameworks 10 | (namely Pandas and Pyspark) 11 | """ 12 | 13 | def __init(self): 14 | pass 15 | 16 | def min(self, column, feature): 17 | pass 18 | 19 | def max(self, column, feature): 20 | pass 21 | 22 | def distribution(self, column, feature, bins=None): 23 | pass 24 | 25 | @property 26 | def _registered_params(self): 27 | return list(self.__dict__.keys()) 28 | 29 | def set_config(self, **kwargs): 30 | """ Set attributes with the keys of the dictionary. These can be later used within 31 | specific calculator methods (like `distribution()` for specifying the number of bins). 32 | 33 | For this to work, the parameter must have been explicitly declared during object's 34 | constructor. 
That is, you cannot pass here a parameter name which the calculator doesn't 35 | support (or this will raise a ValueError). 36 | 37 | Args: 38 | **kwargs (dict): The names and values of the desired parameters to set. 39 | 40 | Raises ValueError if any keyword argument does not exist among the existing attributes of 41 | the object. 42 | """ 43 | if kwargs is None: 44 | return 45 | 46 | for key, val in kwargs.items(): 47 | if not hasattr(self, key): 48 | raise ValueError( 49 | f"Error. This calculator doesn't support the `{key}` parameter. Available options are {self._registered_params}" 50 | ) 51 | setattr(self, key, val) 52 | 53 | 54 | class PandasStatCalculator(FeatureCalculator): 55 | """ Implementation of a Calculator for Pandas 56 | 57 | Supported setting keys are the following: 58 | 59 | - `distribution_bins_method`: The method for setting the number of bins when 60 | calling the `distribution` method. Note that this only has effect when feature is 61 | either discrete or continuous. 62 | - `limit_categorical_perc`: The method for truncating categorical variables with 63 | high cardinality 64 | """ 65 | def __init__(self): 66 | super().__init__() 67 | self.distribution_bins_method = 'sqrt' 68 | self.limit_categorical_perc = None 69 | 70 | def min(self, column, feature): 71 | feature.stats['min'] = float(column.min()) 72 | 73 | def max(self, column, feature): 74 | feature.stats['max'] = float(column.max()) 75 | 76 | def distribution(self, column, feature, bins=None): 77 | """ Calculates the histogram for a given feature. 78 | 79 | Args: 80 | column (pd.Series): Pandas column with the data 81 | feature (Feature): Feature which holds the metadata 82 | bins (Union[int, str, None]): (Only used for numerical features) If a number, the histogram will 83 | have `bins` bins. If a string, it will use an automatic NumPy method for 84 | estimating this number. 
See more about available methods here: 85 | https://numpy.org/devdocs/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges. 86 | If None is provided, it uses the default class' method, which is `sqrt`. 87 | For binary features it simply uses bins=2 and for categoricals, bins=|categories| if is not limited 88 | with 'limit_categorical_perc' in set_config method. 89 | """ 90 | if 'no_nan_filtered' not in feature.cache: 91 | no_na = column.dropna() 92 | feature.cache['no_nan_filtered'] = no_na 93 | else: 94 | no_na = feature.cache['no_nan_filtered'] 95 | 96 | if isinstance(feature, (BinaryFeature, CategoricalFeature)): 97 | 98 | no_na = no_na[no_na.isin(feature.stats['domain'])] # It may be truncated 99 | t = (no_na.value_counts() / len(no_na)).sort_index() 100 | feature.stats['distribution'] = t.values 101 | feature.stats['distribution'] = [float(x) for x in feature.stats['distribution']] 102 | feature.stats['distribution_bins'] = list(t.index) 103 | 104 | else: 105 | bins = self.distribution_bins_method if not bins else bins 106 | histo = np.histogram(no_na, bins=bins) 107 | feature.stats['distribution'] = list(histo[0] / no_na.count()) 108 | feature.stats['distribution'] = [float(x) for x in feature.stats['distribution']] 109 | feature.stats['distribution_bins'] = list(histo[1]) 110 | 111 | def std(self, column, feature): 112 | feature.stats['std'] = column.std() 113 | 114 | def mean(self, column, feature): 115 | feature.stats['mean'] = column.mean() 116 | 117 | 118 | class SparkStatCalculator(FeatureCalculator): 119 | def __init__(self): 120 | pass 121 | 122 | 123 | class StatCalculatorFactory: 124 | """ This static class receives a DataFrame and returns a particular implementation 125 | of a FeatureCalculator 126 | """ 127 | @classmethod 128 | def build_calculator( 129 | cls, 130 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"] # noqa: F821 131 | ) -> FeatureCalculator: 132 | 133 | if "pyspark" in str(type(dataframe)): 134 | 
We import seaborn just to load the tips dataset to play with it.
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "0e956955-4edc-4ca4-9944-e819fde35884", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "tips = sns.load_dataset('tips')\n", 70 | "tips['sex'] = tips['sex'].astype(str)\n", 71 | "tips['smoker'] = tips['smoker'].astype(str)\n", 72 | "tips['day'] = tips['day'].astype(str)\n", 73 | "tips['time'] = tips['time'].astype(str)\n", 74 | "\n", 75 | "tips\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "d1370edb-5fc7-4c09-a604-37c6f4b54cfe", 81 | "metadata": {}, 82 | "source": [ 83 | "## Automated type detection" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "1e5279db-e38e-4331-b098-7f8514880122", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "schema = DataSchema().generate(tips)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "b53ff5e4-31de-42eb-accc-29674a2af640", 99 | "metadata": {}, 100 | "source": [ 101 | "The method `.generate` generates for each of the columns an object of class Feature that allows abstracting its details\n", 102 | "and using it in the same way across types.\n", 103 | "\n", 104 | "This is how many mercury packages work.\n", 105 | "\n", 106 | "As you can see in the previous warning, it treats an integer variable as categorical because it has only two values. 
The package also includes an Anonymize class that supports multiple key management functions, controllable precision and secure cryptography.
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "4e7bb476-cf2d-40d4-b9bb-b2c94cebe5e4", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "anon = Anonymize(digest_bits = 12)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "388ccda3-e723-4dfe-b0ce-70a7c722aec0", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "anon.set_key('Mickey Mouse')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "29a7aa87-dfda-4f5d-a5f4-b8809bd3ff54", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "anon.anonymize_list_any_type(list(tips['total_bill']))[0:10]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "27f406f7-6e32-4b42-9089-79dce4484a26", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3 (ipykernel)", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.13.0" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 5 231 | } 232 | -------------------------------------------------------------------------------- /mercury/dataschema/anonymize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM 4 | from cryptography.hazmat.primitives import hashes 5 | 6 | import base64 7 | import math 8 | 9 | 10 | class Anonymize: 11 | 12 | """Cryptographically secure anonymization. 
13 | 14 | This class encrypts or hashes lists of strings using cryptographically secure standardized algorithms. 15 | It can be used with a user defined key or without a key in which case it will produce identical hashes 16 | across different platforms. 17 | 18 | The key can be given at construction time by setting the environment variable MERCURY_ANONYMIZE_DATASCHEMA_KEY 19 | or at any later time by calling the .set_key() method. 20 | 21 | Args: 22 | digest_bits (int): This determines the length in (effective) bits of the output hash. As it is encoded in base64, 23 | the number of characters will be 1/6 times this number. E.g., 96 (the default) produces 16 24 | char long hashes. If this is set to a value other than zero, the output length is fixed, the 25 | output is irreversible (cannot be used with .deanonymize_list()) and the algorithm used for 26 | hashing is keyed BLAKE2 (https://www.blake2.net/). 27 | If this is set to zero, you will get a variable length secure encryption using Galois/Counter 28 | Mode AES. (see the argument `safe_crypto`) and the result can be deanonymized with the same key 29 | using .deanonymize_list(). 30 | safe_crypto (bool): This argument selects how the encryption is randomized. If True, the same original text with 31 | the same key produces different encrypted texts each time. Note that this will change the 32 | cardinality of the set of values to the length of the list. 33 | If false (the default) the same text will produce the same output with the same key. This 34 | preserves cardinality, but can be a target of attacks when the attacker has access to 35 | encoded pairs. 
36 | """ 37 | 38 | def __init__(self, digest_bits=96, safe_crypto=False): 39 | self.digest_bits = digest_bits 40 | self.safe_crypto = safe_crypto 41 | 42 | plain_key = os.environ.get('MERCURY_ANONYMIZE_DATASCHEMA_KEY') 43 | plain_key = '' if plain_key is None else plain_key 44 | 45 | hash_key = hashes.Hash(hashes.BLAKE2s(32)) 46 | 47 | hash_key.update(plain_key.encode('utf-8')) 48 | 49 | self.hash_key = hash_key.finalize()[0:16] 50 | 51 | def set_key(self, encryption_key): 52 | """Set the encryption key of an existing `Anonymize` object. 53 | 54 | This changes the encryption key overriding the key possibly defined using the environment variable 55 | MERCURY_ANONYMIZE_DATASCHEMA_KEY at construction. It can be called any number of times. 56 | 57 | Args: 58 | encryption_key (list): The key as a string. 59 | """ 60 | hash_key = hashes.Hash(hashes.BLAKE2s(32)) 61 | 62 | hash_key.update(encryption_key.encode('utf-8')) 63 | 64 | self.hash_key = hash_key.finalize()[0:16] 65 | 66 | def anonymize_list(self, list_of_str): 67 | """Anonymize a list of strings. 68 | 69 | This hashes or encrypts a list of strings. The precise function is defined at object construction. 70 | (See the doc of the class `Anonymize` for details.) 71 | 72 | Args: 73 | list_of_str (list): A list of strings to be anonymized. 74 | 75 | Returns (list): 76 | The anonymized list of strings encoded in base64. 
77 | """ 78 | l2 = list() 79 | 80 | if self.digest_bits != 0: 81 | digest_len = math.ceil(self.digest_bits / 6) 82 | 83 | for s in list_of_str: 84 | hash = hashes.Hash(hashes.BLAKE2b(64)) 85 | hash.update(self.hash_key) 86 | hash.update(s.encode('utf-8')) 87 | 88 | l2.append(base64.encodebytes(hash.finalize()).decode()[0:digest_len]) 89 | else: 90 | aes = AESGCM(self.hash_key) 91 | 92 | if self.safe_crypto: 93 | for s in list_of_str: 94 | nonce = os.urandom(12) # Must be >8 (min requirement) and multiple of 6 (fixed length in) 95 | cipher = aes.encrypt(nonce, s.encode('utf-8'), None) 96 | 97 | l2.append(base64.encodebytes(nonce + cipher).decode()) 98 | else: 99 | nonce = b'12345678' 100 | for s in list_of_str: 101 | cipher = aes.encrypt(nonce, s.encode('utf-8'), None) 102 | 103 | l2.append(base64.encodebytes(cipher).decode()) 104 | 105 | return l2 106 | 107 | def anonymize_list_any_type(self, list_of_any): 108 | """Anonymize a list of anything that supports conversion to string. 109 | 110 | This is a wrapper function over anonymize_list(). It verifies if any element in the list is 111 | not a string first. If all elements are strings, it passes the list to anonymize_list(). 112 | Otherwise, it creates a new list of string elements and passes that to anonymize_list(). 113 | 114 | Args: 115 | list_of_any (list): A list of any data type that supports string conversion via str() to be anonymized. 116 | 117 | Returns (list): 118 | The anonymized list of strings encoded in base64. 119 | """ 120 | 121 | assert type(list_of_any) == list 122 | 123 | all_str = True 124 | for s in list_of_any: 125 | if type(s) != str: 126 | all_str = False 127 | break 128 | 129 | if all_str: 130 | return self.anonymize_list(list_of_any) 131 | 132 | return self.anonymize_list([str(e) for e in list_of_any]) 133 | 134 | def deanonymize_list(self, list_of_str): 135 | """Deanonymize a list of strings. 136 | 137 | Deanonymizes a list of anonymized strings recovering the original text. 
This can only be applied if 138 | the encryption is reversible (The object was created with `digest_bits = 0`) and the key is the same 139 | key used for encryption. 140 | 141 | Raises ValueError when called on an object that does hashing (is created with `digest_bits > 0`) 142 | rather than encryption. 143 | 144 | 145 | Args: 146 | list_of_str (list): A list of strings anonymized using a previous .anonymize_list() call. 147 | 148 | Returns (list): 149 | The original deanonymized list of strings. 150 | """ 151 | if self.digest_bits != 0: 152 | raise ValueError("deanonymize_list() requires passing 'digest_bits = 0' to the constructor.") 153 | 154 | l2 = list() 155 | 156 | aes = AESGCM(self.hash_key) 157 | 158 | if self.safe_crypto: 159 | for s in list_of_str: 160 | raw = base64.decodebytes(s.encode()) 161 | nonce = raw[0:12] 162 | cipher = raw[12:] 163 | 164 | l2.append(aes.decrypt(nonce, cipher, None).decode('utf-8')) 165 | else: 166 | nonce = b'12345678' 167 | for s in list_of_str: 168 | cipher = base64.decodebytes(s.encode()) 169 | 170 | l2.append(aes.decrypt(nonce, cipher, None).decode('utf-8')) 171 | 172 | return l2 173 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # mercury-dataschema 2 | 3 | [![](https://github.com/BBVA/mercury-dataschema/actions/workflows/test.yml/badge.svg)](https://github.com/BBVA/mercury-dataschema) 4 | ![](https://img.shields.io/badge/latest-1.1.2-blue) 5 | 6 | `mercury-dataschema` is a submodule of the Mercury library which acts as a utility tool that, given a Pandas DataFrame, its `DataSchema` class auto-infers feature types and automatically calculates different statistics depending on them. 7 | 8 | This type inference isn't solely based on data types but in the information the variables contain. 
For example: if a feature is encoded as a `float` but its cardinality is 2, we can be sure it's a binary feature. 9 | 10 | This package is used by other Mercury submodules, and you can also use it separately from the rest of the library. 11 | 12 | As an idea (there are plenty of them, though), it is particularly useful when preprocessing datasets. Having to specify the typical `categorical_cols` and `continuous_cols` is over! 13 | 14 | ## Mercury project at BBVA 15 | 16 | Mercury is a collaborative library that was developed by the Advanced Analytics community at BBVA. Originally, it was created as an [InnerSource](https://en.wikipedia.org/wiki/Inner_source) project but after some time, we decided to release certain parts of the project as Open Source. 17 | That's the case with the `mercury-dataschema` package. 18 | 19 | If you're interested in learning more about the Mercury project, we recommend reading this blog [post](https://www.bbvaaifactory.com/mercury-acelerando-la-reutilizacion-en-ciencia-de-datos-dentro-de-bbva/) from www.bbvaaifactory.com 20 | 21 | ## User installation 22 | 23 | The easiest way to install `mercury-dataschema` is using ``pip``: 24 | 25 | pip install -U mercury-dataschema 26 | 27 | ## Example 28 | 29 | ```python 30 | from mercury.dataschema.schemagen import DataSchema 31 | from mercury.dataschema.feature import FeatType 32 | 33 | dataset = UCIDataset().load() # Any DataFrame 34 | 35 | schma = (DataSchema() # Generate a lazy Schema object 36 | .generate(dataset) # Manually trigger its construction (it mostly infers data types...)
37 | .calculate_statistics()) # Manually trigger extra statistic calculations for each feature 38 | ``` 39 | 40 | Then, we can inspect all the features with 41 | 42 | ```python 43 | schma.feats 44 | ``` 45 | 46 | ``` 47 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 48 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 49 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 50 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 51 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 52 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 53 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 54 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 55 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 56 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 57 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 58 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 59 | 'BILL_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 60 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 61 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 62 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 63 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 64 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 65 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 66 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 67 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 68 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 69 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 70 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 71 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 72 | ``` 73 | 74 | 
And we can get extra feature statistics by inspecting the .stats attribute of the `Feature` objects. 75 | 76 | ```python 77 | schma.feats['BILL_AMT4'].stats 78 | ``` 79 | 80 | ``` 81 | {'num_nan': 0, 82 | 'percent_nan': 0.0, 83 | 'samples': 30000, 84 | 'percent_unique': 0.7182666666666667, 85 | 'cardinality': 21548, 86 | 'min': -170000.0, 87 | 'max': 891586.0, 88 | 'distribution': [3.3333333333333335e-05, 89 | 0.0, 90 | 3.3333333333333335e-05, 91 | 0.0, 92 | 0.0, 93 | 3.3333333333333335e-05, 94 | 0.0, 95 | 3.3333333333333335e-05, 96 | 3.3333333333333335e-05, 97 | 0.0, 98 | 3.3333333333333335e-05, 99 | 6.666666666666667e-05, 100 | 6.666666666666667e-05, 101 | 0.00016666666666666666, 102 | ..., 103 | 0.0, 104 | 0.0, 105 | 0.0, 106 | 0.0, 107 | 0.0, 108 | 3.3333333333333335e-05], 109 | 'distribution_bins': [-170000.0, 110 | -163898.93103448275, 111 | -157797.8620689655, 112 | -151696.7931034483, 113 | ..., 114 | 867181.724137931, 115 | 873282.7931034482, 116 | 879383.8620689653, 117 | 885484.9310344828, 118 | 891586.0]} 119 | ``` 120 | 121 | ```python 122 | schma.feats 123 | ``` 124 | 125 | ``` 126 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 127 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 128 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 129 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 130 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 131 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 132 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 133 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 134 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 135 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 136 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 137 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 138 | 'BILL_AMT1': Discrete Feature 
(NAME=None, dtype=DataType.FLOAT), 139 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 140 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 141 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 142 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 143 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 144 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 145 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 146 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 147 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 148 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 149 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 150 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 151 | ``` 152 | 153 | Note how for different features, the computed statistics vary: 154 | 155 | ```python 156 | schma.feats['default.payment.next.month'].stats 157 | ``` 158 | 159 | ``` 160 | {'num_nan': 0, 161 | 'percent_nan': 0.0, 162 | 'samples': 30000, 163 | 'percent_unique': 6.666666666666667e-05, 164 | 'cardinality': 2, 165 | 'distribution': [0.7788, 0.2212], 166 | 'distribution_bins': [0, 1], 167 | 'domain': [1, 0]} 168 | ``` 169 | 170 | ## Saving and loading schemas 171 | 172 | You can serialize and reload `DataSchema`s so you can reuse them in the future. 173 | 174 | ```python 175 | PATH = 'schma.json' 176 | # Save the schema 177 | schma.save(PATH) 178 | 179 | # Load it back! 180 | recovered = DataSchema.load(PATH) 181 | ``` 182 | 183 | ## Help and support 184 | 185 | This library is currently maintained by a dedicated team of data scientists and machine learning engineers from BBVA AI Factory. 
186 | 187 | ### Documentation 188 | website: https://bbva.github.io/mercury-dataschema/site/ 189 | 190 | ### Email 191 | mercury.group@bbva.com 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mercury-dataschema 2 | 3 | [![](https://github.com/BBVA/mercury-dataschema/actions/workflows/test.yml/badge.svg)](https://github.com/BBVA/mercury-dataschema) 4 | ![](https://img.shields.io/badge/latest-1.1.2-blue) 5 | [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-3816/) 6 | [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-3916/) 7 | [![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-31011/) 8 | [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3119/) 9 | [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3128/) 10 | [![Python 3.13](https://img.shields.io/badge/python-3.13-blue.svg)](https://www.python.org/downloads/release/python-3131/) 11 | [![Apache 2 license](https://shields.io/badge/license-Apache%202-blue)](http://www.apache.org/licenses/LICENSE-2.0) 12 | [![Ask Me Anything !](https://img.shields.io/badge/Ask%20me-anything-1abc9c.svg)](https://github.com/BBVA/mercury-dataschema/issues) 13 | 14 | `mercury-dataschema` is a submodule of the Mercury library which acts as a utility tool that, given a Pandas DataFrame, its `DataSchema` class auto-infers feature types and automatically calculates different statistics depending on them. 15 | 16 | This type inference isn't solely based on data types but in the information the variables contain. 
For example: if a feature is encoded as a `float` but its cardinality is 2, we can be sure it's a binary feature. 17 | 18 | This package is used by other Mercury submodules, and you can also use it separately from the rest of the library. 19 | 20 | As an idea (there are plenty of them, though), it is particularly useful when preprocessing datasets. Having to specify the typical `categorical_cols` and `continuous_cols` is over! 21 | 22 | ## Mercury project at BBVA 23 | 24 | Mercury is a collaborative library that was developed by the Advanced Analytics community at BBVA. Originally, it was created as an [InnerSource](https://en.wikipedia.org/wiki/Inner_source) project but after some time, we decided to release certain parts of the project as Open Source. 25 | That's the case with the `mercury-dataschema` package. 26 | 27 | If you're interested in learning more about the Mercury project, we recommend reading this blog [post](https://www.bbvaaifactory.com/mercury-acelerando-la-reutilizacion-en-ciencia-de-datos-dentro-de-bbva/) from www.bbvaaifactory.com 28 | 29 | ## User installation 30 | 31 | The easiest way to install `mercury-dataschema` is using ``pip``: 32 | 33 | pip install -U mercury-dataschema 34 | 35 | ## Example 36 | 37 | ```python 38 | from mercury.dataschema.schemagen import DataSchema 39 | from mercury.dataschema.feature import FeatType 40 | 41 | dataset = UCIDataset().load() # Any DataFrame 42 | 43 | schma = (DataSchema() # Generate a lazy Schema object 44 | .generate(dataset) # Manually trigger its construction (it mostly infers data types...)
45 | .calculate_statistics()) # Manually trigger extra statistic calculations for each feature 46 | ``` 47 | 48 | Then, we can inspect all the features with 49 | 50 | ```python 51 | schma.feats 52 | ``` 53 | 54 | ``` 55 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 56 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 57 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 58 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 59 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 60 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 61 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 62 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 63 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 64 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 65 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 66 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 67 | 'BILL_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 68 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 69 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 70 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 71 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 72 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 73 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 74 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 75 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 76 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 77 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 78 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 79 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 80 | ``` 81 | 82 | 
And we can get extra feature statistics by inspecting the .stats attribute of the `Feature` objects. 83 | 84 | ```python 85 | schma.feats['BILL_AMT4'].stats 86 | ``` 87 | 88 | ``` 89 | {'num_nan': 0, 90 | 'percent_nan': 0.0, 91 | 'samples': 30000, 92 | 'percent_unique': 0.7182666666666667, 93 | 'cardinality': 21548, 94 | 'min': -170000.0, 95 | 'max': 891586.0, 96 | 'distribution': [3.3333333333333335e-05, 97 | 0.0, 98 | 3.3333333333333335e-05, 99 | 0.0, 100 | 0.0, 101 | 3.3333333333333335e-05, 102 | 0.0, 103 | 3.3333333333333335e-05, 104 | 3.3333333333333335e-05, 105 | 0.0, 106 | 3.3333333333333335e-05, 107 | 6.666666666666667e-05, 108 | 6.666666666666667e-05, 109 | 0.00016666666666666666, 110 | ..., 111 | 0.0, 112 | 0.0, 113 | 0.0, 114 | 0.0, 115 | 0.0, 116 | 3.3333333333333335e-05], 117 | 'distribution_bins': [-170000.0, 118 | -163898.93103448275, 119 | -157797.8620689655, 120 | -151696.7931034483, 121 | ..., 122 | 867181.724137931, 123 | 873282.7931034482, 124 | 879383.8620689653, 125 | 885484.9310344828, 126 | 891586.0]} 127 | ``` 128 | 129 | ```python 130 | schma.feats 131 | ``` 132 | 133 | ``` 134 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 135 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 136 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 137 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 138 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 139 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 140 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 141 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 142 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 143 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 144 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 145 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 146 | 'BILL_AMT1': Discrete 
Feature (NAME=None, dtype=DataType.FLOAT), 147 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 148 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 149 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 150 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 151 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 152 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 153 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 154 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 155 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 156 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 157 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 158 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 159 | ``` 160 | 161 | Note how for different features, the computed statistics vary: 162 | 163 | ```python 164 | schma.feats['default.payment.next.month'].stats 165 | ``` 166 | 167 | ``` 168 | {'num_nan': 0, 169 | 'percent_nan': 0.0, 170 | 'samples': 30000, 171 | 'percent_unique': 6.666666666666667e-05, 172 | 'cardinality': 2, 173 | 'distribution': [0.7788, 0.2212], 174 | 'distribution_bins': [0, 1], 175 | 'domain': [1, 0]} 176 | ``` 177 | 178 | ## Example notebooks 179 | 180 | ```python 181 | from mercury.dataschema import create_tutorials 182 | 183 | create_tutorials('.') # Creates a folder with example notebooks in the current path. 184 | ``` 185 | 186 | ## Saving and loading schemas 187 | 188 | You can serialize and reload `DataSchema`s so you can reuse them in the future. 189 | 190 | ```python 191 | PATH = 'schma.json' 192 | # Save the schema 193 | schma.save(PATH) 194 | 195 | # Load it back! 
196 | recovered = DataSchema.load(PATH) 197 | ``` 198 | 199 | ## Help and support 200 | 201 | This library is currently maintained by a dedicated team of data scientists and machine learning engineers from BBVA. 202 | 203 | ### Documentation 204 | website: https://bbva.github.io/mercury-dataschema/site/ 205 | 206 | ### Email 207 | mercury.group@bbva.com 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /mercury/dataschema/feature.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | 7 | class DataType(Enum): 8 | INTEGER = 'INTEGER' 9 | FLOAT = 'FLOAT' 10 | STRING = 'STRING' 11 | DATE = 'DATE' 12 | BOOL = 'BOOL' 13 | CATEGORICAL = 'CATEGORICAL' # for pandas categorical type 14 | UNKNOWN = 'UNKNOWN' 15 | 16 | 17 | class FeatType(Enum): 18 | BINARY = 'BINARY' 19 | CATEGORICAL = 'CATEGORICAL' 20 | DISCRETE = 'DISCRETE' 21 | CONTINUOUS = 'CONTINUOUS' 22 | UNKNOWN = 'UNKNOWN' 23 | 24 | 25 | class Feature: 26 | """ This class represents a generic feature within a schema. 
class BinaryFeature(Feature):
    """ This class represents a binary feature within a schema
    (i.e. only two possible values).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes the base statistics plus a fixed two-bin distribution.
        The domain of a binary feature is exactly its set of unique values. """
        super().build_stats(column, calculator)
        self.stats['domain'] = self.cache['uniques']
        calculator.distribution(column, self, bins=2)
        return self

    def __str__(self):
        return f"Binary Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.BINARY


class CategoricalFeature(Feature):
    """ This class represents a categorical feature within a schema
    (i.e. only N possible values).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics, the feature domain and its distribution.

        When the calculator defines a numeric `limit_categorical_perc`, high
        cardinality domains are truncated to the most frequent values so the
        serialized schema stays small.
        """
        super().build_stats(column, calculator)

        perc_limit = calculator.limit_categorical_perc
        domain = self.cache['uniques']

        if isinstance(perc_limit, (int, float)):
            if not 0 < perc_limit < 1:
                raise ValueError("Input Error: 'limit_categorical_perc' must be a float between 0 and 1")

            if len(self.cache['uniques']) / self.stats['samples'] > perc_limit:
                warnings.warn(f"{self.name} will be truncated in both statistics 'domain' and 'distribution' with the most frequent values")
                # Keep only the N most frequent values, N proportional to the dataset size.
                keep = int(perc_limit * self.stats['samples'])
                domain = list(column.value_counts().index[:keep])

        self.stats['domain'] = domain
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Categorical Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.CATEGORICAL


class DiscreteFeature(Feature):
    """ This class represents a discrete feature within a schema
    (i.e. any number without decimals).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics plus min, max and the distribution. """
        super().build_stats(column, calculator)
        calculator.min(column, self)
        calculator.max(column, self)
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Discrete Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.DISCRETE


class ContinuousFeature(Feature):
    """ This class represents a continuous feature within a schema
    (e.g. a float).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics plus min, max, mean, std and the distribution. """
        super().build_stats(column, calculator)
        calculator.min(column, self)
        calculator.max(column, self)
        calculator.mean(column, self)
        calculator.std(column, self)
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Continuous Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.CONTINUOUS
class FeatureFactory:
    """ Builds `Feature` objects from pandas columns, inferring both the low
    level datatype (int, float, string, ...) and the high level feature type
    (binary, categorical, discrete or continuous).
    """

    def __init__(self):
        pass

    def infer_datatype(self, column: "pandas.Series", feature: Feature) -> DataType:  # noqa: F821
        """ Finds out the data type of the column.

        Args:
            column: column which datatype will be inferred
            feature: Feature object. This is needed because we want to cache several internal
                     operations, so future calls are faster.

        Returns:
            Returns the datatype of the column
        """
        datatype = DataType.UNKNOWN

        if column.dtype.name == 'category':
            datatype = DataType.CATEGORICAL
        elif np.issubdtype(column, np.integer):
            datatype = DataType.INTEGER
        elif np.issubdtype(column, np.bool_):
            datatype = DataType.BOOL
        elif np.issubdtype(column, np.floating):
            datatype = DataType.FLOAT
        elif np.issubdtype(column, np.object_):
            # Sample one non-NaN value to decide the type. Guard against fully
            # NaN columns, where there is nothing to sample (previously this
            # raised IndexError on `.iloc[0]`).
            no_nan = feature.cache['no_nan_filtered']
            if len(no_nan) > 0 and type(no_nan.iloc[0]) is str:
                datatype = DataType.STRING
            # TODO: the value could also be another array
            # TODO: the value could also be a json (dict)
            # TODO: the value could also be a datetime

        return datatype

    def _build_dummy_feature(self, datatype: DataType, feat_type: FeatType, name: str) -> Feature:
        """ Returns a dummy and uninitialized feature. This method is not intended to be
        used apart from serialization purposes.
        """
        constructors = {
            FeatType.BINARY: BinaryFeature,
            FeatType.CATEGORICAL: CategoricalFeature,
            FeatType.DISCRETE: DiscreteFeature,
            FeatType.CONTINUOUS: ContinuousFeature,
        }
        # Unknown feature types fall back to the generic Feature class.
        feat = constructors.get(feat_type, Feature)()
        feat.dtype = datatype
        feat.name = name

        return feat

    def _infer_feature_type_from_float(self, feat, threshold_categorical, colname, verbose=False):
        """ Decides whether a FLOAT column is categorical, discrete or continuous,
        based on whether it holds decimals and on its percentage of unique values. """
        if (feat.cache['no_nan_filtered'] % 1 == 0).all():  # The float column doesn't contain decimals
            if feat.stats['percent_unique'] < threshold_categorical:
                # Case Categorical as float
                if verbose:
                    warnings.warn(
                        f"""FLOAT feature {colname} converted to Categorical because percentage of unique """
                        f"""values {feat.stats['percent_unique']} is lower than threshold {threshold_categorical}""",
                        RuntimeWarning
                    )
                return FeatType.CATEGORICAL

            # Case Discrete as Float
            return FeatType.DISCRETE

        # If it does contain decimals, directly create Continuous Feature (categorical feature would rarely be
        # codified as floats with decimals)
        return FeatType.CONTINUOUS

    def _infer_feature_type_from_int(self, feat, threshold_categorical, colname, verbose=False):
        """ Decides whether an INTEGER column is discrete or categorical, based
        on its percentage of unique values. """
        if feat.stats['percent_unique'] >= threshold_categorical:
            return FeatType.DISCRETE

        if verbose:
            warnings.warn(
                f"""INTEGER feature {colname} converted to Categorical because percentage of unique """
                f"""values {feat.stats['percent_unique']} is lower than threshold {threshold_categorical}""",
                RuntimeWarning
            )
        return FeatType.CATEGORICAL

    def build_feature(self,
                      column: 'pandas.Series',  # noqa: F821
                      colname: str = None,
                      threshold_categorical: float = 1e-5,
                      force_feat_type: FeatType = None,
                      verbose: bool = True
                      ) -> Feature:
        """ Builds a schema Feature object given a column.

        Args:
            column: Column to be analyzed
            colname: Name of the column (feature)
            threshold_categorical: percentage of necessary unique values for a feature to be considered
                                   categorical. If the percentage of unique values < cat_threshold, the
                                   column will be taken as categorical. This parameter can be a single float
                                   (same threshold for all columns) or a dict in which each key is the name of
                                   the column. Use the later for custom thresholds per column.
            force_feat_type: If user wants to force a variable to be of certain type, he/she can use
                             this parameter and its type will not be auto-inferred, but set to this.
            verbose: If this is set to False, possible inner warnings won't be shown.

        Returns:
            Feature with only the base statistics calculated
        """
        feat = Feature().build_stats(column)
        datatype = self.infer_datatype(column, feat)
        feat_type = FeatType.UNKNOWN

        # If user forces the feature type we kindly fulfill his/her wishes
        if force_feat_type is not None:
            featret = self._build_dummy_feature(datatype, force_feat_type, colname)
            # Bug fix: this previously did `feat.stats.update(feat.stats)` on the
            # freshly-built dummy (a self-update no-op), so the forced feature was
            # returned with empty stats. Copy the base stats computed above instead.
            featret.stats.update(feat.stats)
            return featret

        if feat.stats['cardinality'] == 2:
            feat_type = FeatType.BINARY
        else:
            # Data could still be either categorical, discrete or continuous
            if datatype is DataType.FLOAT:
                feat_type = self._infer_feature_type_from_float(feat, threshold_categorical, colname, verbose=verbose)

            if datatype is DataType.INTEGER:
                feat_type = self._infer_feature_type_from_int(feat, threshold_categorical, colname, verbose=verbose)

            if (datatype is DataType.STRING) or (datatype is DataType.CATEGORICAL):
                feat_type = FeatType.CATEGORICAL

        featret = self._build_dummy_feature(datatype, feat_type, colname)
        featret.stats.update(feat.stats)
        return featret
def test_dataschema_build(datasets):
    """ The generated schema must assign the expected Feature subclass and name
    to every column of the tips / titanic datasets. """
    tips, titanic = datasets

    schma = DataSchema().generate(tips)

    expected_types = {
        'sex': BinaryFeature,
        'smoker': BinaryFeature,
        'time': BinaryFeature,
        'size': CategoricalFeature,
        'day': CategoricalFeature,
        'total_bill': ContinuousFeature,
        'tip': ContinuousFeature,
    }
    for colname, expected_cls in expected_types.items():
        assert isinstance(schma.feats[colname], expected_cls)

    for colname in ('sex', 'size', 'total_bill'):
        assert schma.feats[colname].name == colname

    schma = DataSchema().generate(titanic)
    assert isinstance(schma.feats['deck'], CategoricalFeature)
    assert schma.feats['deck'].stats['percent_nan'] > 0
    assert schma.feats['adult_male'].dtype == DataType.BOOL
    assert schma.feats['adult_male'].name == 'adult_male'


def test_dataschema_stats(datasets):
    """ Checks a handful of the statistics calculated by calculate_statistics(). """
    tips, titanic = datasets

    schma = DataSchema().generate(tips).calculate_statistics()
    tip_stats = schma.feats['tip'].stats

    assert tip_stats['min'] == 1.0
    assert tip_stats['max'] == 10.0
    assert tip_stats['mean'] == pytest.approx(2.99827868852459)
    assert tip_stats['percent_unique'] == pytest.approx(0.5040983606557377)

    schma = DataSchema().generate(titanic).calculate_statistics()
    sex_stats = schma.feats['sex'].stats
    assert sex_stats['distribution_bins'][0] == 'female'
    assert sex_stats['distribution_bins'][1] == 'male'
    assert sex_stats['distribution'][0] == pytest.approx(0.35655738, 0.1)


def test_dataschema_stats_custom_params(datasets):
    """ calculate_statistics() must honor both a single shared config and
    per-feature config dicts, and validate `limit_categorical_perc`. """
    _, titanic = datasets

    # Default binning produces more than 10 bins for 'age'.
    schma = DataSchema().generate(titanic).calculate_statistics()
    assert len(schma.feats['age'].stats['distribution']) > 10

    # Single shared config.
    schma = DataSchema().generate(titanic).calculate_statistics({'distribution_bins_method': 5})
    assert len(schma.feats['age'].stats['distribution']) == 5

    # One config per feature.
    per_feature = {
        'age': {'distribution_bins_method': 5},
        'fare': {'distribution_bins_method': 3}
    }
    schma = DataSchema().generate(titanic).calculate_statistics(per_feature)
    assert len(schma.feats['age'].stats['distribution']) == 5
    assert len(schma.feats['fare'].stats['distribution']) == 3

    # High-cardinality string column gets its domain truncated.
    titanic = titanic.reset_index().rename(columns={'index': 'ID'})
    titanic["ID"] = titanic["ID"].astype(str)
    schma = DataSchema().generate(titanic).calculate_statistics({'limit_categorical_perc': 0.05})
    assert len(schma.feats['ID'].stats['domain']) == 44

    # Out-of-range limit raises.
    with pytest.raises(ValueError) as e:
        DataSchema().generate(titanic).calculate_statistics({'limit_categorical_perc': 5})
    assert "Input Error: 'limit_categorical_perc' must be a float between 0 and 1" in str(e.value)
def test_errors_dataschema_anonymize(datasets):
    """ Checks all the error paths of DataSchema.anonymize / deanonymize. """
    tips, titanic = datasets

    schma = DataSchema().generate(titanic)

    # Empty params dict. Note: spelling fixed from "anonymise" to match the
    # message actually raised by DataSchema.anonymize().
    with pytest.raises(UserWarning) as w:
        schma.anonymize({})
    assert "To anonymize, it is necessary to use a dictionary with the format: {'var1':anonymizer1, 'var2':anonymizer2}" in str(w.value)

    an_encrypt = Anonymize(0)
    an_encrypt.set_key("07jaPY")

    # Non categorical/binary feature.
    with pytest.raises(ValueError) as e:
        schma.anonymize({'fare': an_encrypt})
    assert "Input Error: Anonymize only supports Categorical or Binary variables ->" in str(e.value)

    # Unknown column name.
    with pytest.raises(ValueError) as e:
        schma.anonymize({'farer': an_encrypt})
    assert "Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema" in str(e.value)

    # Bug fix: this assert was a bare string literal (always truthy), so the
    # warning message was never actually checked. Also spelling aligned with
    # the message raised by deanonymize().
    with pytest.raises(UserWarning) as w:
        schma.deanonymize({})
    assert "To De-anonymize, it is necessary to use a dictionary with the format: {'var1':anonym1, 'var2':anonym2}" in str(w.value)

    with pytest.raises(ValueError) as e:
        schma.deanonymize({'fare': an_encrypt})
    assert "Input Error: Deanonymize only supports Categorical or Binary variables ->" in str(e.value)

    # Bug fix: deanonymize() validates unknown keys *before* checking feature
    # types, so an unknown column raises the keys error, not the type error.
    with pytest.raises(ValueError) as e:
        schma.deanonymize({'farer': an_encrypt})
    assert "Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema" in str(e.value)
def test_generate_manual(datasets):
    """ generate_manual() must honor the user-provided type lists and default
    every other column to continuous. """
    tips, titanic = datasets

    schma = DataSchema().generate_manual(
        titanic,
        categ_columns=['class'],
        discrete_columns=['age'],
        binary_columns=['survived', 'alive']
    )

    forced = {
        'class': CategoricalFeature,
        'age': DiscreteFeature,
        'alive': BinaryFeature,
        'survived': BinaryFeature,
    }
    for key, item in schma.feats.items():
        assert isinstance(item, forced.get(key, ContinuousFeature))

    # With empty lists, everything is continuous by default.
    schma = DataSchema().generate_manual(
        titanic,
        categ_columns=[],
        discrete_columns=[],
        binary_columns=[]
    )
    for item in schma.feats.values():
        assert isinstance(item, ContinuousFeature)


def test_validate(datasets):
    """ validate() must reject schemas with mismatching dtypes or feature sets. """
    tips, titanic = datasets

    # Same columns, but 'deck' becomes an integer constant -> dtype mismatch.
    altered = titanic.copy()
    altered['deck'] = 0

    schma = DataSchema().generate(titanic)
    schma_altered = DataSchema().generate(altered)

    with pytest.raises(RuntimeError) as exinfo:
        schma.validate(schma_altered)
    assert "Data types types do not match. 'deck' in other is DataType.INTEGER. However, DataType.STRING is expected." in str(exinfo.value)

    # Missing column -> feature set mismatch.
    dropped = titanic.drop('deck', axis=1)
    schma_dropped = DataSchema().generate(dropped)

    with pytest.raises(RuntimeError) as exinfo:
        schma.validate(schma_dropped)
    assert "Features do not match." in str(exinfo.value)


def test_serialization(datasets, tmpdir):
    """ A schema saved to JSON and loaded back must validate in both directions. """
    tips, titanic = datasets

    schma = DataSchema().generate(titanic).calculate_statistics()
    path = str(tmpdir) + '/schema.json'
    schma.save(path)
    recovered = DataSchema.load(path)

    # If any of this fail, the serialization is wrong.
    schma.validate(recovered)
    recovered.validate(schma)


def test_get_features_by_type(datasets):
    """ get_features_by_type() must return exactly the columns with that dtype. """
    tips, titanic = datasets
    schema = DataSchema().generate(titanic)

    expected_str = {'class', 'alive', 'deck', 'embark_town', 'embarked', 'sex', 'who'}
    expected_float = {'age', 'fare'}
    assert set(schema.get_features_by_type(DataType.STRING)) == expected_str
    assert set(schema.get_features_by_type(DataType.FLOAT)) == expected_float
def test_pandas_categorical_type():
    # Test added after bug discovery that schemas with dataframes with categorical type raise Exception
    rng_int = np.random.choice([0, 1, 2, 3], size=100)
    rng_str = np.random.choice(["A", "B", "C", "D"], size=100)

    df = pd.DataFrame(data={
        'categorical_int': rng_int,
        'categorical_str': rng_str
    }).astype({
        'categorical_int': 'category',
        'categorical_str': 'category'
    })

    schema = DataSchema().generate(df)

    for colname in ('categorical_int', 'categorical_str'):
        assert colname in schema.categorical_feats
        assert isinstance(schema.feats[colname], CategoricalFeature)


def test_float_conversions():
    """ Float columns must be mapped to categorical, discrete or continuous
    depending on whether they hold few uniques, integers-as-floats, or real
    decimal values. """
    df = pd.DataFrame(data={
        'float_categorical': np.random.choice([0., 1., 2.], size=1000),
        'float_discrete': np.random.randint(0, 10000, size=1000).astype(float),
        'float_continous': np.random.uniform(0, 10000, size=1000)
    })

    schema = DataSchema().generate(df)

    assert isinstance(schema.feats['float_categorical'], CategoricalFeature)
    assert isinstance(schema.feats['float_discrete'], DiscreteFeature)
    assert isinstance(schema.feats['float_continous'], ContinuousFeature)
'int_categorical'], 343 | # num_feats=['int_discrete', 'float_discrete', 'float_continous'] 344 | # ) 345 | # assert isinstance(schema.feats['float_categorical'], CategoricalFeature) 346 | # assert isinstance(schema.feats['float_discrete'], DiscreteFeature) 347 | # assert isinstance(schema.feats['float_continous'], ContinuousFeature) 348 | # assert isinstance(schema.feats['int_categorical'], CategoricalFeature) 349 | # assert isinstance(schema.feats['int_discrete'], DiscreteFeature) 350 | 351 | # If a feature is specified both as numerical and categorical, then an exception is raised 352 | # with pytest.raises(ValueError) as exinfo: 353 | # schema = DataSchema().generate(df, cat_feats=['float_discrete'], num_feats=['float_discrete']) 354 | 355 | # String column as a numeric doesn't change it (raises warning) 356 | df['str_float_categorical'] = df['float_categorical'].astype(str) 357 | schema = DataSchema().generate(df) 358 | assert isinstance(schema.feats['str_float_categorical'], CategoricalFeature) 359 | 360 | -------------------------------------------------------------------------------- /mercury/dataschema/schemagen.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | from typing import Union, List, Dict 5 | 6 | from .feature import ( 7 | FeatureFactory, 8 | ContinuousFeature, 9 | DiscreteFeature, 10 | CategoricalFeature, 11 | BinaryFeature 12 | ) 13 | 14 | from .feature import ( 15 | DataType, 16 | FeatType 17 | ) 18 | 19 | from .calculator import StatCalculatorFactory 20 | 21 | 22 | class DataSchema: 23 | """ Dataset schema 24 | 25 | This class takes a dataframe and generates its schema as a collection of feature. 26 | Feature objects. Each one of them will contain metadata and statistics about a 27 | column of the original dataframe that can be further exploded. 
28 | 29 | 30 | Example: 31 | ```python 32 | >>> schma = DataSchema()\ 33 | >>> .generate(dataset)\ 34 | >>> .calculate_statistics() 35 | 'DISBURSED_AMOUNT': Categorical Feature (NAME=DISBURSED_AMOUNT, dtype=DataType.INTEGER), 36 | 'ASSET_COST': Categorical Feature (NAME=ASSET_COST, dtype=DataType.INTEGER), 37 | 'LTV': Continuous Feature (NAME=LTV, dtype=DataType.FLOAT), 38 | 'BUREAU_SCORE': Discrete Feature (NAME=BUREAU_SCORE, dtype=DataType.INTEGER), 39 | 'BUREAU_SCORE_DESCRIPTION': Categorical Feature (NAME=BUREAU_SCORE_DESCRIPTION, dtype=DataType.STRING), 40 | 'NEW_LOANS_IN_LAST_SIX_MONTHS': Discrete Feature (NAME=NEW_LOANS_IN_LAST_SIX_MONTHS, dtype=DataType.INTEGER), 41 | 'DEFAULTED_LOANS_IN_LAST_SIX_MONTHS': Discrete Feature (NAME=DEFAULTED_LOANS_IN_LAST_SIX_MONTHS, dtype=DataType.INTEGER), 42 | 'NUM_LOANS_TAKEN': Discrete Feature (NAME=NUM_LOANS_TAKEN, dtype=DataType.INTEGER), 43 | 'NUM_ACTIVE_LOANS': Discrete Feature (NAME=NUM_ACTIVE_LOANS, dtype=DataType.INTEGER), 44 | 'NUM_DEFAULTED_LOANS': Discrete Feature (NAME=NUM_DEFAULTED_LOANS, dtype=DataType.INTEGER), 45 | 'AGE': Discrete Feature (NAME=AGE, dtype=DataType.INTEGER), 46 | 'GENDER': Binary Feature (NAME=GENDER, dtype=DataType.STRING), 47 | 'CIVIL_STATUS': Categorical Feature (NAME=CIVIL_STATUS, dtype=DataType.STRING), 48 | 'ORIGIN': Binary Feature (NAME=ORIGIN, dtype=DataType.STRING), 49 | 'DIGITAL': Binary Feature (NAME=DIGITAL, dtype=DataType.INTEGER), 50 | 'SCORE': Continuous Feature (NAME=SCORE, dtype=DataType.FLOAT), 51 | 'PREDICTION': Binary Feature (NAME=PREDICTION, dtype=DataType.INTEGER)} 52 | >>> schma.feats['SCORE'].stats 53 | {'num_nan': 0, 54 | 'percent_nan': 0.0, 55 | 'samples': 233154, 56 | 'percent_unique': 0.7967352050576014, 57 | 'cardinality': 185762, 58 | 'min': 0.17454321487679067, 59 | 'max': 0.9373813084029072, 60 | 'mean': 0.7625553210045813, 61 | 'std': 0.15401509786623635, 62 | 'distribution': array([7.48617716e-07, 1.07579979e-06, 1.40298186e-06, 1.73016394e-06, 63 | 
2.05734601e-06, 2.38452809e-06, 2.71171016e-06, 3.03889224e-06, 64 | 3.36607431e-06, 3.69325638e-06, 4.02043846e-06])} 65 | # Specifying custom parameters (shared among all features) for the calculate_statistics method 66 | >>> schma = DataSchema()\ 67 | ... .generate(dataset)\ 68 | ... .calculate_statistics({'distribution_bins_method': 'sqrt'}) # Specify bin generation method (see numpy.hist) 69 | 70 | # We can also specify granular statistic parameters per variable 71 | >>> schma = DataSchema()\ 72 | ... .generate(dataset)\ 73 | ... .calculate_statistics({'SCORE': {'distribution_bins_method': 'sqrt'}}) # Specify bin generation method (see numpy.hist) 74 | 75 | >>> schma = DataSchema()\ 76 | ... .generate(dataset)\ 77 | ... .calculate_statistics({'SCORE': {'distribution_bins_method': 5}}) # Specify 5 bins only for numerical features 78 | ``` 79 | """ 80 | def __init__(self): 81 | self.dataframe = None 82 | self.feats = {} 83 | self._feat_factory = None 84 | self._generated = False 85 | 86 | def generate_manual( 87 | self, 88 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"], # noqa: F821 89 | categ_columns: List[str], 90 | discrete_columns: List[str], 91 | binary_columns: List[str], 92 | custom_stats: dict = None, 93 | ) -> "DataSchema": 94 | """ Builds the schema manually. This acts like `generate()` but in a more restrictive way. 95 | All the names passed to `categ_columns` will be taken as categorical features, no more, no less. 96 | It will avoid making automatic type inference on every feature not in `categ_columns`. 97 | The same rule is applied on `discrete_columns`. 98 | 99 | Note: 100 | This method is considered to be low level. If you use this, make sure the type assignment 101 | to each feature type is compatible with the datatypes (float, int, string,...) in the column or 102 | a later call to `calculate_statistics` could fail. 103 | 104 | Args: 105 | dataframe (pd.DataFrame): DataFrame on which the schema will be inferred. 
106 | categ_columns (List[str]): list of columns which will be forced to be taken as categorical. Warning: 107 | all features not in this list are guaranteed not being categorical 108 | discrete_columns (List[str]): list of columns which will be forced to be taken as discrete. Warning: 109 | all features not in this list are guaranteed not to be taken as discrete (i.e. 110 | they will be continuous). 111 | binary_columns (List[str]): list of column which will be forced to be taken as binary. 112 | custom_stats (Optional[Dict[str, Any]]): Custom statistics to be calculated for each column. 113 | """ 114 | force_types = {} 115 | for col in dataframe.columns: 116 | if col in categ_columns: 117 | force_types[col] = FeatType.CATEGORICAL 118 | else: 119 | # Is in either binary, continuous or discrete lists 120 | if col in discrete_columns: 121 | force_types[col] = FeatType.DISCRETE 122 | elif col in binary_columns: 123 | force_types[col] = FeatType.BINARY 124 | else: 125 | force_types[col] = FeatType.CONTINUOUS 126 | 127 | return self.generate( 128 | dataframe=dataframe, 129 | force_types=force_types, 130 | verbose=False, 131 | custom_stats=custom_stats 132 | ) 133 | 134 | def generate( 135 | self, 136 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"], # noqa: F821 137 | force_types: Dict[str, FeatType] = None, 138 | custom_stats: dict = None, 139 | verbose: bool = True, 140 | ) -> "DataSchema": 141 | """ Builds the schema. For float and integer datatypes, by default the method tries to infer 142 | if a feature is categorical or numeric (Continuous or Discrete) depending on the percentage 143 | of unique values. However, that doesn't work in all the cases. In those cases, you can use 144 | the `force_types` param to specify which features should be categorical and which 145 | should be numeric independently of the percentage of unique values. 146 | 147 | Args: 148 | dataframe: DataFrame on which the schema will be inferred. 
149 | force_types: Dictionary with the form that contains the features to be 150 | forced to a specific type (Continuous, Discrete, Categorical...) 151 | custom_stats: Custom statistics to be calculated for each column 152 | verbose: whether to show or filter all possible warning messages 153 | """ 154 | if "pyspark" in str(type(dataframe)): 155 | raise RuntimeError("Sorry, Pyspark is not supported yet...") 156 | 157 | self.dataframe = dataframe 158 | self._generated = True 159 | 160 | self._feat_factory = FeatureFactory() 161 | 162 | inferring_types = True if force_types is None else False 163 | 164 | for col in self.dataframe.columns: 165 | thresh = self._get_threshold(len(self.dataframe)) 166 | 167 | # Look if the feature type has been specified 168 | forced_type = None 169 | if not inferring_types and col in force_types: 170 | forced_type = force_types[col] 171 | 172 | feat = self._feat_factory.build_feature( 173 | self.dataframe.loc[:, col], 174 | col, 175 | force_feat_type=forced_type, 176 | threshold_categorical=thresh, 177 | verbose=inferring_types and verbose # Only show warnings (if any) when using default args. 178 | ) 179 | self.feats[col] = feat 180 | 181 | return self 182 | 183 | def anonymize(self, anonymize_params: dict) -> "DataSchema": 184 | """ 185 | Anonymize the selected features of a data schema. 186 | 187 | Args: 188 | anonymize_params: Dictionary where the keys are the names of the columns to be anonymized and the values 189 | are mercury.contrib.dataschema.Anonymize objects that can be used to anonymize them. 190 | Raises: 191 | UserWarning, if anonymize_params is empty. 192 | ValueError, if the feature selected to deanonymize is not binary or categorical, or is not a feature of the dataschema. 
193 | """ 194 | if not anonymize_params: 195 | raise UserWarning("To anonymize, it is necessary to use a dictionary with the format: {'var1':anonymizer1, 'var2':anonymizer2}") 196 | 197 | if any(feat not in self.feats.keys() for feat in anonymize_params.keys()): 198 | raise ValueError("Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema") 199 | 200 | for feature in list(self.feats.keys()): 201 | anon = anonymize_params.get(feature) 202 | 203 | if anon: 204 | if not isinstance(self.feats[feature], (BinaryFeature, CategoricalFeature)): 205 | raise ValueError(f"Input Error: Anonymze only supports Categorical or Binary variables -> {feature}, You can use \ 206 | the `force_types` param in 'generate()' to specify which features should be categorical ") 207 | else: 208 | self.feats[feature].stats['distribution_bins'] = anon.\ 209 | anonymize_list_any_type(list(self.feats[feature].stats['distribution_bins'])) 210 | self.feats[feature].stats['domain'] = anon.\ 211 | anonymize_list_any_type(list(self.feats[feature].stats['domain'])) 212 | 213 | return self 214 | 215 | def deanonymize(self, anonymize_params: dict) -> "DataSchema": 216 | """ 217 | De-anonymize the selected features on a preloaded schema. 218 | 219 | Raises UserWarning, if anonymize_params is empty. 220 | Raises ValueError, if the feature selected to deanonymize is not binary or categorical, or is not a feature of the dataschema. 221 | 222 | Args: 223 | anonymize_params: Dictionary where the keys are the names of the columns to be deanonymized and the values 224 | are mercury.contrib.dataschema.Anonymize objects that can be used to deanonymize them. 
        """
        if not anonymize_params:
            raise UserWarning("To De-anonymize, it is necessary to use a dictionary with the format: {'var1':anonym1, 'var2':anonym2}")

        if any(feat not in self.feats.keys() for feat in anonymize_params.keys()):
            raise ValueError("Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema")

        # Iterate the schema's features; anon is None for features that have no
        # entry in anonymize_params, so those are left untouched.
        for feature in list(self.feats.keys()):
            anon = anonymize_params.get(feature)

            if anon:

                if not isinstance(self.feats[feature], (BinaryFeature, CategoricalFeature)):
                    raise ValueError(f"Input Error: Deanonymize only supports Categorical or Binary variables -> {feature} ")
                else:
                    # Integer features are cast back to int after decoding, everything
                    # else to str (deanonymize_list presumably yields strings — TODO confirm).
                    operation = int if self.feats[feature].dtype == DataType.INTEGER else str
                    self.feats[feature].stats['distribution_bins'] = \
                        list(map(operation, anon.deanonymize_list(self.feats[feature].stats['distribution_bins'])))
                    self.feats[feature].stats['domain'] = \
                        list(map(operation, anon.deanonymize_list(self.feats[feature].stats['domain'])))
        return self

    def calculate_statistics(
        self,
        calculator_configs: dict = None
    ) -> "DataSchema":
        """ Triggers the computation of all statistics for all registered features
        of the schema.

        Args:
            calculator_configs: Optional configurations for each of the calculator parameters.
                This can be either a dict or a "dict of dicts". In the first case,
                the statistics for ALL FEATURES will be computed with those parameters.
                Additionally, you can specify a mapping of [feature_name: {config}] with
                granular configurations per feature.
                The supported configuration keys are the attributes declared within a calculator class.
                See mercury.contrib.dataschema.calculator.PandasStatCalculator (or Spark) for details.
        """
        featnames = list(self.feats.keys())

        calculator_configs = calculator_configs if calculator_configs else {}

        # User can pass us two types of dict:
        # - {'param': 'value', 'param2': 'value'}          -> Single config shared for all variables
        # - {'feat1': {config1}, 'feat2': {config2}, ...}  -> 1 config per variable
        # The two shapes are told apart by whether the first value is itself a dict.
        multiple_configs = len(calculator_configs) > 0 and isinstance(list(calculator_configs.values())[0], dict)

        # Case when user pass a single shared config: one calculator reused for
        # every feature.
        if not multiple_configs:
            calculator = StatCalculatorFactory.build_calculator(self.dataframe)
            calculator.set_config(**calculator_configs)

        for feature in featnames:
            if multiple_configs:
                # Case when user pass one config per variable: a fresh calculator
                # per feature; features without an entry keep the default config.
                calculator = StatCalculatorFactory.build_calculator(self.dataframe)
                if feature in calculator_configs:
                    calculator.set_config(**(calculator_configs[feature]))

            # Calculate distributions
            self.feats[feature].build_stats(self.dataframe.loc[:, feature], calculator)

        return self

    def _get_threshold(self, dataset_size):
        """ Calculates a dynamic threshold for determining whether a variable is categorical
        given the dataset. It uses an asymptotic function (whose lim->0) clipped to a maximum value of 1.
292 | """ 293 | return np.minimum(1, 50 / (dataset_size)) 294 | 295 | @property 296 | def continuous_feats(self) -> List[str]: 297 | """ List with the names of all continuous features 298 | """ 299 | return [key for key, feat in self.feats.items() if isinstance(feat, ContinuousFeature)] 300 | 301 | @property 302 | def categorical_feats(self) -> List[str]: 303 | """ List with the names of all categorical features 304 | """ 305 | return [key for key, feat in self.feats.items() if isinstance(feat, CategoricalFeature)] 306 | 307 | @property 308 | def binary_feats(self) -> List[str]: 309 | """ List with the names of all binary features 310 | """ 311 | return [key for key, feat in self.feats.items() if isinstance(feat, BinaryFeature)] 312 | 313 | @property 314 | def discrete_feats(self) -> List[str]: 315 | """ List with the names of all discrete features 316 | """ 317 | return [key for key, feat in self.feats.items() if isinstance(feat, DiscreteFeature)] 318 | 319 | def validate(self, other: "DataSchema"): 320 | """ Validates other schema with this one. The other schema will be considered 321 | valid if it shares the same feature names and datatypes with this. 322 | 323 | Raises RuntimeError if other schema differs from this one 324 | 325 | Args: 326 | other: other schema to be checked from this one 327 | """ 328 | # Check feature names match 329 | if list(self.feats.keys()) != list(other.feats.keys()): 330 | diff = set(self.feats.keys()) - set(other.feats.keys()) 331 | raise RuntimeError(f"Features do not match. These ones are not present on both datasets {list(diff)}") 332 | 333 | # Check feature and data types are the same 334 | for key, item in other.feats.items(): 335 | if not isinstance(item, self.feats[key].__class__): 336 | raise RuntimeError(f"""Feature types do not match. '{key}' in other is """ 337 | f"""{type(item)}. 
However, {type(self.feats[key])} is expected.""") 338 | 339 | if item.dtype != self.feats[key].dtype: 340 | raise RuntimeError(f"""Data types types do not match. '{key}' in other is """ 341 | f"""{item.dtype}. However, {self.feats[key].dtype} is expected.""") 342 | 343 | def to_json(self) -> dict: 344 | """ Converts the schema to a JSON representation 345 | 346 | Returns: 347 | dictionary with the features and their stats 348 | """ 349 | retdict = dict(feats=dict()) 350 | for key, val in self.feats.items(): 351 | retdict['feats'][key] = self.feats[key].to_json() 352 | 353 | return retdict 354 | 355 | def save(self, path): 356 | """ Saves a JSON with the schema representation 357 | 358 | Args: 359 | path (str): where the JSON will be saved. 360 | """ 361 | with open(path, 'w') as file: 362 | json.dump(self.to_json(), file) 363 | 364 | @classmethod 365 | def load(cls, path: str) -> "DataSchema": 366 | """ Loads a previously serialized schema (as JSON) 367 | 368 | Args: 369 | path: path to the serialized schema 370 | 371 | Returns: 372 | The rebuilt schema 373 | """ 374 | with open(path, 'r') as file: 375 | json_obj = json.load(file) 376 | schema = cls.from_json(json_obj) 377 | return schema 378 | 379 | @classmethod 380 | def from_json(cls, json_obj: dict) -> "DataSchema": 381 | """ Rebuilds an schema from a JSON representation. 
382 | 383 | Returns: 384 | The rebuild schema 385 | """ 386 | schema = DataSchema() 387 | factory = FeatureFactory() 388 | 389 | for featname, feat in json_obj['feats'].items(): 390 | ftype = FeatType[feat['feat_type']] 391 | dtype = DataType[feat['dtype']] 392 | feat_name = feat['name'] 393 | dummy_feat = factory._build_dummy_feature(dtype, ftype, feat_name) 394 | dummy_feat.stats = feat['stats'] 395 | schema.feats[featname] = dummy_feat 396 | 397 | return schema 398 | 399 | def get_features_by_type(self, datatype: DataType): 400 | return [key for key, feat in self.feats.items() if feat.dtype == datatype] 401 | --------------------------------------------------------------------------------