├── .gitignore ├── tests ├── __init__.py └── dataschema │ ├── __init__.py │ ├── test_calculator.py │ ├── test_anonymize.py │ └── test_schemagen.py ├── requirements.txt ├── docs ├── reference │ └── dataschema.md └── index.md ├── MANIFEST.in ├── mercury └── dataschema │ ├── __init__.py │ ├── create_tutorials.py │ ├── calculator.py │ ├── tutorials │ └── hello_dataschema.ipynb │ ├── anonymize.py │ ├── feature.py │ └── schemagen.py ├── CHANGELOG.md ├── .bumpversion.cfg ├── .github └── workflows │ ├── pypi_upload.yml │ └── test.yml ├── mkdocs.yml ├── pyproject.toml ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/dataschema/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography 2 | numpy 3 | -------------------------------------------------------------------------------- /docs/reference/dataschema.md: -------------------------------------------------------------------------------- 1 | # Data Schema 2 | 3 | ::: mercury.dataschema -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft mercury/dataschema/ 2 | 3 | recursive-include mercury/dataschema/tutorials * 4 | -------------------------------------------------------------------------------- /mercury/dataschema/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '1.1.2' 2 | 3 | from .schemagen import DataSchema 4 | from .create_tutorials import create_tutorials 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Latest version 1.1.2 2 | 3 | | Release | Date | Main feature(s) | 4 | | -------- | ---- | --------------- | 5 | | 1.1.2 | 2025/02/11 | Implements create_tutorials(), adds support for python 3.13, improves documentation. | 6 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.1.2 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(-(?P\w+)\.(?P\d+))? 4 | serialize = 5 | {major}.{minor}.{patch}-{release}.{build} 6 | {major}.{minor}.{patch} 7 | commit = True 8 | tag = True 9 | 10 | [bumpversion:file:mercury/dataschema/__init__.py] 11 | 12 | [bumpversion:file:README.md] 13 | 14 | [bumpversion:file:docs/index.md] 15 | 16 | [bumpversion:file:pyproject.toml] 17 | -------------------------------------------------------------------------------- /.github/workflows/pypi_upload.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install wheel 20 | pip install build 21 | - name: Build package 22 | run: | 23 | python -m build 24 | - name: Publish package 25 | uses: pypa/gh-action-pypi-publish@release/v1 26 | with: 27 | user: ${{ 
secrets.pypi_user }} 28 | password: ${{ secrets.pypi_password }} 29 | packages_dir: ./dist/ 30 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: mercury-dataschema 2 | repo_url: https://github.com/BBVA/mercury-dataschema/ 3 | repo_name: mercury-dataschema 4 | theme: 5 | name: material 6 | features: 7 | - tabs 8 | - navigation.indexes 9 | icon: 10 | logo: material/book-open-page-variant 11 | repo: fontawesome/brands/github 12 | site_dir: site 13 | nav: 14 | - Home: index.md 15 | #- Contributing: 16 | # - How to contribute: CONTRIBUTING.md 17 | - Api: 18 | - dataschema: reference/dataschema.md 19 | markdown_extensions: 20 | - codehilite 21 | - admonition 22 | - pymdownx.superfences 23 | - pymdownx.arithmatex: 24 | generic: true 25 | extra_css: 26 | - stylesheets/extra.css 27 | extra_javascript: 28 | - javascripts/config.js 29 | - https://polyfill.io/v3/polyfill.min.js?features=es6 30 | - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js 31 | plugins: 32 | - mkdocstrings: 33 | handlers: 34 | python: 35 | options: 36 | show_root_heading: true 37 | show_submodules: true 38 | merge_init_into_class: true 39 | docstring_style: google 40 | dev_addr: 0.0.0.0:8080 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mercury-dataschema" 7 | license = {file = "LICENSE.txt"} 8 | version = "1.1.2" 9 | authors = [ 10 | { name="Mercury Team", email="mercury.group@bbva.com" }, 11 | ] 12 | description = "Mercury's DataSchema package allows the automatic recognition and validation of feature types." 
13 | readme = "README.md" 14 | requires-python = ">=3.7" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dependencies = [ 21 | 'numpy', 22 | 'cryptography' 23 | ] 24 | 25 | [project.optional-dependencies] 26 | dev = [ 27 | 'seaborn', 28 | 'pytest', 29 | 'flake8', 30 | ] 31 | doc = [ 32 | 'mkdocs', 33 | 'mkdocstrings[python]', 34 | 'mkdocs-material', 35 | 'mkdocs-minify-plugin==0.5.0', 36 | 'mkdocs-exclude', 37 | 'nbconvert', 38 | ] 39 | 40 | [project.urls] 41 | "Homepage" = "https://github.com/BBVA/mercury-dataschema" 42 | "Bug Tracker" = "https://github.com/BBVA/mercury-dataschema/issues" 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Mercury-Dataschema 2 | 3 | on: 4 | push: 5 | branches: [ "master", "develop" ] 6 | pull_request: 7 | branches: [ "master", "develop" ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install package 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install flake8 pytest build 27 | python -m pip install -e .[dev] 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . 
--count --select=E9,F63,F7,F82 --show-source --max-line-length=127 --statistics 32 | - name: Test with pytest 33 | run: | 34 | pytest 35 | - name: Test build 36 | run: | 37 | python -m build -------------------------------------------------------------------------------- /mercury/dataschema/create_tutorials.py: -------------------------------------------------------------------------------- 1 | import os, pkg_resources, shutil 2 | 3 | 4 | def create_tutorials(destination, silent = False): 5 | """ 6 | Copies mercury.dataschema tutorial notebooks to `destination`. A folder will be created inside 7 | destination, named 'dataschema_tutorials'. The folder `destination` must exist. 8 | 9 | Args: 10 | destination (str): The destination directory 11 | silent (bool): If True, suppresses output on success. 12 | 13 | Raises: 14 | ValueError: If `destination` is equal to source path. 15 | 16 | Examples: 17 | >>> # copy tutorials to /tmp/dataschema_tutorials 18 | >>> from mercury.dataschema import create_tutorials 19 | >>> create_tutorials('/tmp') 20 | 21 | """ 22 | src = pkg_resources.resource_filename(__package__, 'tutorials') 23 | dst = os.path.abspath(destination) 24 | 25 | assert src != dst, 'Destination (%s) cannot be the same as source.' % src 26 | 27 | assert os.path.isdir(dst), 'Destination (%s) must be a directory.' 
% dst 28 | 29 | dst = os.path.join(dst, 'dataschema_tutorials') 30 | 31 | assert not os.path.exists(dst), 'Destination (%s) already exists' % dst 32 | 33 | shutil.copytree(src, dst) 34 | 35 | if not silent: 36 | print('Tutorials copied to: %s' % dst) 37 | -------------------------------------------------------------------------------- /tests/dataschema/test_calculator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from mercury.dataschema.calculator import StatCalculatorFactory, PandasStatCalculator 6 | from mercury.dataschema.feature import Feature 7 | 8 | 9 | @pytest.fixture(scope='module') 10 | def pandas_df(): 11 | data = [['tom', 10], ['nick', 15], ['juli', 14]] 12 | return pd.DataFrame(data, columns=['Name', 'Age']) 13 | 14 | 15 | def test_calculator_factory(pandas_df): 16 | assert isinstance(StatCalculatorFactory.build_calculator(pandas_df), PandasStatCalculator) 17 | 18 | 19 | def test_calculator(pandas_df): 20 | calculator = StatCalculatorFactory.build_calculator(pandas_df) 21 | 22 | feature = Feature() 23 | 24 | calculator.min(pandas_df['Age'], feature) 25 | calculator.max(pandas_df['Age'], feature) 26 | calculator.std(pandas_df['Age'], feature) 27 | calculator.mean(pandas_df['Age'], feature) 28 | 29 | assert feature.stats['min'] == 10 30 | assert feature.stats['max'] == 15 31 | assert feature.stats['mean'] == 13 32 | 33 | 34 | def test_set_config(pandas_df): 35 | calculator = StatCalculatorFactory.build_calculator(pandas_df) 36 | with pytest.raises(ValueError): 37 | calculator.set_config(**{'nonexistingattr': 10}) 38 | 39 | # assert it assigns the property well 40 | calculator.set_config(**{'distribution_bins_method': 10}) 41 | assert calculator.distribution_bins_method == 10 42 | 43 | # Assert does nothing with None 44 | calculator.set_config() 45 | -------------------------------------------------------------------------------- 
/tests/dataschema/test_anonymize.py: -------------------------------------------------------------------------------- 1 | import os, pytest 2 | 3 | from mercury.dataschema.anonymize import Anonymize 4 | 5 | 6 | def test_anonymize(): 7 | os.environ['MERCURY_ANONYMIZE_DATASCHEMA_KEY'] = 'Mickey Mouse' 8 | 9 | am1 = Anonymize() 10 | am2 = Anonymize(6*6) 11 | 12 | assert am1.hash_key == am2.hash_key 13 | 14 | del os.environ['MERCURY_ANONYMIZE_DATASCHEMA_KEY'] 15 | 16 | an1 = Anonymize(0) 17 | an2 = Anonymize(20*6) 18 | an3 = Anonymize(0, True) 19 | 20 | assert an1.hash_key == an2.hash_key and an1.hash_key == an3.hash_key and an1.hash_key != am1.hash_key 21 | 22 | pl = ['a', 'little', 'bit', 'of text.', 'a', 'ittle', 'bit', 'more.', 'A'] 23 | 24 | cp_am1 = am1.anonymize_list(pl) 25 | 26 | assert [len(s) for s in cp_am1] == [16 for _ in range(9)] 27 | assert cp_am1[0] == cp_am1[4] and cp_am1[1] != cp_am1[5] and cp_am1[2] == cp_am1[6] and cp_am1[0] != cp_am1[8] 28 | 29 | cp_am2 = am2.anonymize_list(pl) 30 | 31 | assert [len(s) for s in cp_am2] == [6 for _ in range(9)] 32 | assert cp_am2[0] == cp_am2[4] and cp_am2[1] != cp_am2[5] and cp_am2[2] == cp_am2[6] and cp_am2[0] != cp_am2[8] 33 | assert [s.startswith(t) for s, t in zip(cp_am1, cp_am2)] == [True for _ in range(9)] 34 | 35 | cp_an1 = an1.anonymize_list(pl) 36 | 37 | assert [len(s) > 16 for s in cp_an1] == [True for _ in range(9)] 38 | assert cp_an1[0] == cp_an1[4] and cp_an1[1] != cp_an1[5] and cp_an1[2] == cp_an1[6] and cp_an1[0] != cp_an1[8] 39 | 40 | cp_an2 = an2.anonymize_list(pl) 41 | 42 | assert [len(s) for s in cp_an2] == [20 for _ in range(9)] 43 | assert cp_an2[0] == cp_an2[4] and cp_an2[1] != cp_an2[5] and cp_an2[2] == cp_an2[6] and cp_an2[0] != cp_an2[8] 44 | 45 | cp_an3 = an3.anonymize_list(pl) 46 | 47 | assert [len(s) > 16 for s in cp_an3] == [True for _ in range(9)] 48 | assert cp_an3[0] != cp_an3[4] and cp_an3[1] != cp_an3[5] and cp_an3[2] != cp_an3[6] and cp_an3[0] != cp_an3[8] 49 | assert [len(t) - 
len(s) for s, t in zip(cp_an1, cp_an3)] == [16 for _ in range(9)] 50 | 51 | with pytest.raises(ValueError): 52 | pl = am1.deanonymize_list(cp_am1) 53 | 54 | with pytest.raises(ValueError): 55 | pl = am2.deanonymize_list(cp_am2) 56 | 57 | with pytest.raises(ValueError): 58 | pl = an2.deanonymize_list(cp_an2) 59 | 60 | pl1 = an1.deanonymize_list(cp_an1) 61 | 62 | assert pl1 == pl 63 | 64 | pl3 = an3.deanonymize_list(cp_an3) 65 | 66 | assert pl3 == pl 67 | 68 | bm1 = Anonymize() 69 | 70 | assert am1.hash_key != bm1.hash_key 71 | 72 | cp_bm1 = bm1.anonymize_list(pl) 73 | 74 | assert cp_am1 != cp_bm1 75 | 76 | bm1.set_key('Mickey Mouse') 77 | 78 | assert am1.hash_key == bm1.hash_key 79 | 80 | cp_bm1 = bm1.anonymize_list(pl) 81 | 82 | assert cp_am1 == cp_bm1 83 | -------------------------------------------------------------------------------- /mercury/dataschema/calculator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Union 3 | 4 | from .feature import BinaryFeature, CategoricalFeature 5 | 6 | 7 | class FeatureCalculator(): 8 | """ This is a base class with the operation definitions. Several classes must 9 | extend this, implementing its operations for each one of the supported frameworks 10 | (namely Pandas and Pyspark) 11 | """ 12 | 13 | def __init(self): 14 | pass 15 | 16 | def min(self, column, feature): 17 | pass 18 | 19 | def max(self, column, feature): 20 | pass 21 | 22 | def distribution(self, column, feature, bins=None): 23 | pass 24 | 25 | @property 26 | def _registered_params(self): 27 | return list(self.__dict__.keys()) 28 | 29 | def set_config(self, **kwargs): 30 | """ Set attributes with the keys of the dictionary. These can be later used within 31 | specific calculator methods (like `distribution()` for specifying the number of bins). 32 | 33 | For this to work, the parameter must have been explicitly declared during object's 34 | constructor. 
That is, you cannot pass here a parameter name which the calculator doesn't 35 | support (or this will raise a ValueError). 36 | 37 | Args: 38 | **kwargs (dict): The names and values of the desired parameters to set. 39 | 40 | Raises ValueError if any keyword argument does not exist among the existing attributes of 41 | the object. 42 | """ 43 | if kwargs is None: 44 | return 45 | 46 | for key, val in kwargs.items(): 47 | if not hasattr(self, key): 48 | raise ValueError( 49 | f"Error. This calculator doesn't support the `{key}` parameter. Available options are {self._registered_params}" 50 | ) 51 | setattr(self, key, val) 52 | 53 | 54 | class PandasStatCalculator(FeatureCalculator): 55 | """ Implementation of a Calculator for Pandas 56 | 57 | Supported setting keys are the following: 58 | 59 | - `distribution_bins_method`: The method for setting the number of bins when 60 | calling the `distribution` method. Note that this only has effect when feature is 61 | either discrete or continuous. 62 | - `limit_categorical_perc`: The method for truncating categorical variables with 63 | high cardinality 64 | """ 65 | def __init__(self): 66 | super().__init__() 67 | self.distribution_bins_method = 'sqrt' 68 | self.limit_categorical_perc = None 69 | 70 | def min(self, column, feature): 71 | feature.stats['min'] = float(column.min()) 72 | 73 | def max(self, column, feature): 74 | feature.stats['max'] = float(column.max()) 75 | 76 | def distribution(self, column, feature, bins=None): 77 | """ Calculates the histogram for a given feature. 78 | 79 | Args: 80 | column (pd.Series): Pandas column with the data 81 | feature (Feature): Feature which holds the metadata 82 | bins (Union[int, str, None]): (Only used for numerical features) If a number, the histogram will 83 | have `bins` bins. If a string, it will use an automatic NumPy method for 84 | estimating this number. 
See more about available methods here: 85 | https://numpy.org/devdocs/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges. 86 | If None is provided, it uses the default class' method, which is `sqrt`. 87 | For binary features it simply uses bins=2 and for categoricals, bins=|categories| if is not limited 88 | with 'limit_categorical_perc' in set_config method. 89 | """ 90 | if 'no_nan_filtered' not in feature.cache: 91 | no_na = column.dropna() 92 | feature.cache['no_nan_filtered'] = no_na 93 | else: 94 | no_na = feature.cache['no_nan_filtered'] 95 | 96 | if isinstance(feature, (BinaryFeature, CategoricalFeature)): 97 | 98 | no_na = no_na[no_na.isin(feature.stats['domain'])] # It may be truncated 99 | t = (no_na.value_counts() / len(no_na)).sort_index() 100 | feature.stats['distribution'] = t.values 101 | feature.stats['distribution'] = [float(x) for x in feature.stats['distribution']] 102 | feature.stats['distribution_bins'] = list(t.index) 103 | 104 | else: 105 | bins = self.distribution_bins_method if not bins else bins 106 | histo = np.histogram(no_na, bins=bins) 107 | feature.stats['distribution'] = list(histo[0] / no_na.count()) 108 | feature.stats['distribution'] = [float(x) for x in feature.stats['distribution']] 109 | feature.stats['distribution_bins'] = list(histo[1]) 110 | 111 | def std(self, column, feature): 112 | feature.stats['std'] = column.std() 113 | 114 | def mean(self, column, feature): 115 | feature.stats['mean'] = column.mean() 116 | 117 | 118 | class SparkStatCalculator(FeatureCalculator): 119 | def __init__(self): 120 | pass 121 | 122 | 123 | class StatCalculatorFactory: 124 | """ This static class receives a DataFrame and returns a particular implementation 125 | of a FeatureCalculator 126 | """ 127 | @classmethod 128 | def build_calculator( 129 | cls, 130 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"] # noqa: F821 131 | ) -> FeatureCalculator: 132 | 133 | if "pyspark" in str(type(dataframe)): 134 | 
We import seaborn just to load the tips dataset to play with it.
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "0e956955-4edc-4ca4-9944-e819fde35884", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "tips = sns.load_dataset('tips')\n", 70 | "tips['sex'] = tips['sex'].astype(str)\n", 71 | "tips['smoker'] = tips['smoker'].astype(str)\n", 72 | "tips['day'] = tips['day'].astype(str)\n", 73 | "tips['time'] = tips['time'].astype(str)\n", 74 | "\n", 75 | "tips\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "d1370edb-5fc7-4c09-a604-37c6f4b54cfe", 81 | "metadata": {}, 82 | "source": [ 83 | "## Automated type detection" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "1e5279db-e38e-4331-b098-7f8514880122", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "schema = DataSchema().generate(tips)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "b53ff5e4-31de-42eb-accc-29674a2af640", 99 | "metadata": {}, 100 | "source": [ 101 | "The method `.generate` generates for each of the columns an object of class Feature that allows abstracting its details\n", 102 | "and using it in the same way across types.\n", 103 | "\n", 104 | "This is how many mercury packages work.\n", 105 | "\n", 106 | "As you can see in the previous warning, it treats an integer variable as categorical because it has only two values. 
The package also includes an Anonymize class that supports multiple key management functions, controllable precision and secure cryptography.
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "4e7bb476-cf2d-40d4-b9bb-b2c94cebe5e4", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "anon = Anonymize(digest_bits = 12)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "388ccda3-e723-4dfe-b0ce-70a7c722aec0", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "anon.set_key('Mickey Mouse')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "29a7aa87-dfda-4f5d-a5f4-b8809bd3ff54", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "anon.anonymize_list_any_type(list(tips['total_bill']))[0:10]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "27f406f7-6e32-4b42-9089-79dce4484a26", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3 (ipykernel)", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.13.0" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 5 231 | } 232 | -------------------------------------------------------------------------------- /mercury/dataschema/anonymize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM 4 | from cryptography.hazmat.primitives import hashes 5 | 6 | import base64 7 | import math 8 | 9 | 10 | class Anonymize: 11 | 12 | """Cryptographically secure anonymization. 
13 | 14 | This class encrypts or hashes lists of strings using cryptographically secure standardized algorithms. 15 | It can be used with a user defined key or without a key in which case it will produce identical hashes 16 | across different platforms. 17 | 18 | The key can be given at construction time by setting the environment variable MERCURY_ANONYMIZE_DATASCHEMA_KEY 19 | or at any later time by calling the .set_key() method. 20 | 21 | Args: 22 | digest_bits (int): This determines the length in (effective) bits of the output hash. As it is encoded in base64, 23 | the number of characters will be 1/6 times this number. E.g., 96 (the default) produces 16 24 | char long hashes. If this is set to a value other than zero, the output length is fixed, the 25 | output is irreversible (cannot be used with .deanonymize_list()) and the algorithm used for 26 | hashing is keyed BLAKE2 (https://www.blake2.net/). 27 | If this is set to zero, you will get a variable length secure encryption using Galois/Counter 28 | Mode AES. (see the argument `safe_crypto`) and the result can be deanonymized with the same key 29 | using .deanonymize_list(). 30 | safe_crypto (bool): This argument selects how the encryption is randomized. If True, the same original text with 31 | the same key produces different encrypted texts each time. Note that this will change the 32 | cardinality of the set of values to the length of the list. 33 | If false (the default) the same text will produce the same output with the same key. This 34 | preserves cardinality, but can be a target of attacks when the attacker has access to 35 | encoded pairs. 
36 | """ 37 | 38 | def __init__(self, digest_bits=96, safe_crypto=False): 39 | self.digest_bits = digest_bits 40 | self.safe_crypto = safe_crypto 41 | 42 | plain_key = os.environ.get('MERCURY_ANONYMIZE_DATASCHEMA_KEY') 43 | plain_key = '' if plain_key is None else plain_key 44 | 45 | hash_key = hashes.Hash(hashes.BLAKE2s(32)) 46 | 47 | hash_key.update(plain_key.encode('utf-8')) 48 | 49 | self.hash_key = hash_key.finalize()[0:16] 50 | 51 | def set_key(self, encryption_key): 52 | """Set the encryption key of an existing `Anonymize` object. 53 | 54 | This changes the encryption key overriding the key possibly defined using the environment variable 55 | MERCURY_ANONYMIZE_DATASCHEMA_KEY at construction. It can be called any number of times. 56 | 57 | Args: 58 | encryption_key (list): The key as a string. 59 | """ 60 | hash_key = hashes.Hash(hashes.BLAKE2s(32)) 61 | 62 | hash_key.update(encryption_key.encode('utf-8')) 63 | 64 | self.hash_key = hash_key.finalize()[0:16] 65 | 66 | def anonymize_list(self, list_of_str): 67 | """Anonymize a list of strings. 68 | 69 | This hashes or encrypts a list of strings. The precise function is defined at object construction. 70 | (See the doc of the class `Anonymize` for details.) 71 | 72 | Args: 73 | list_of_str (list): A list of strings to be anonymized. 74 | 75 | Returns (list): 76 | The anonymized list of strings encoded in base64. 
77 | """ 78 | l2 = list() 79 | 80 | if self.digest_bits != 0: 81 | digest_len = math.ceil(self.digest_bits / 6) 82 | 83 | for s in list_of_str: 84 | hash = hashes.Hash(hashes.BLAKE2b(64)) 85 | hash.update(self.hash_key) 86 | hash.update(s.encode('utf-8')) 87 | 88 | l2.append(base64.encodebytes(hash.finalize()).decode()[0:digest_len]) 89 | else: 90 | aes = AESGCM(self.hash_key) 91 | 92 | if self.safe_crypto: 93 | for s in list_of_str: 94 | nonce = os.urandom(12) # Must be >8 (min requirement) and multiple of 6 (fixed length in) 95 | cipher = aes.encrypt(nonce, s.encode('utf-8'), None) 96 | 97 | l2.append(base64.encodebytes(nonce + cipher).decode()) 98 | else: 99 | nonce = b'12345678' 100 | for s in list_of_str: 101 | cipher = aes.encrypt(nonce, s.encode('utf-8'), None) 102 | 103 | l2.append(base64.encodebytes(cipher).decode()) 104 | 105 | return l2 106 | 107 | def anonymize_list_any_type(self, list_of_any): 108 | """Anonymize a list of anything that supports conversion to string. 109 | 110 | This is a wrapper function over anonymize_list(). It verifies if any element in the list is 111 | not a string first. If all elements are strings, it passes the list to anonymize_list(). 112 | Otherwise, it creates a new list of string elements and passes that to anonymize_list(). 113 | 114 | Args: 115 | list_of_any (list): A list of any data type that supports string conversion via str() to be anonymized. 116 | 117 | Returns (list): 118 | The anonymized list of strings encoded in base64. 119 | """ 120 | 121 | assert type(list_of_any) == list 122 | 123 | all_str = True 124 | for s in list_of_any: 125 | if type(s) != str: 126 | all_str = False 127 | break 128 | 129 | if all_str: 130 | return self.anonymize_list(list_of_any) 131 | 132 | return self.anonymize_list([str(e) for e in list_of_any]) 133 | 134 | def deanonymize_list(self, list_of_str): 135 | """Deanonymize a list of strings. 136 | 137 | Deanonymizes a list of anonymized strings recovering the original text. 
This can only be applied if 138 | the encryption is reversible (The object was created with `digest_bits = 0`) and the key is the same 139 | key used for encryption. 140 | 141 | Raises ValueError when called on an object that does hashing (is created with `digest_bits > 0`) 142 | rather than encryption. 143 | 144 | 145 | Args: 146 | list_of_str (list): A list of strings anonymized using a previous .anonymize_list() call. 147 | 148 | Returns (list): 149 | The original deanonymized list of strings. 150 | """ 151 | if self.digest_bits != 0: 152 | raise ValueError("deanonymize_list() requires passing 'digest_bits = 0' to the constructor.") 153 | 154 | l2 = list() 155 | 156 | aes = AESGCM(self.hash_key) 157 | 158 | if self.safe_crypto: 159 | for s in list_of_str: 160 | raw = base64.decodebytes(s.encode()) 161 | nonce = raw[0:12] 162 | cipher = raw[12:] 163 | 164 | l2.append(aes.decrypt(nonce, cipher, None).decode('utf-8')) 165 | else: 166 | nonce = b'12345678' 167 | for s in list_of_str: 168 | cipher = base64.decodebytes(s.encode()) 169 | 170 | l2.append(aes.decrypt(nonce, cipher, None).decode('utf-8')) 171 | 172 | return l2 173 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # mercury-dataschema 2 | 3 | [![](https://github.com/BBVA/mercury-dataschema/actions/workflows/test.yml/badge.svg)](https://github.com/BBVA/mercury-dataschema) 4 | ![](https://img.shields.io/badge/latest-1.1.2-blue) 5 | 6 | `mercury-dataschema` is a submodule of the Mercury library which acts as a utility tool that, given a Pandas DataFrame, its `DataSchema` class auto-infers feature types and automatically calculates different statistics depending on them. 7 | 8 | This type inference isn't solely based on data types but in the information the variables contain. 
For example: if a feature is encoded as a `float` but its cardinality is 2, we can be sure it's a binary feature. 9 | 10 | This package is used by other Mercury submodules, and you can also use it separately from the rest of the library. 11 | 12 | As an idea (there are plenty of them, though), it is particularly useful when preprocessing datasets. Having to specify the typical `categorical_cols` and `continuous_cols` is over! 13 | 14 | ## Mercury project at BBVA 15 | 16 | Mercury is a collaborative library that was developed by the Advanced Analytics community at BBVA. Originally, it was created as an [InnerSource](https://en.wikipedia.org/wiki/Inner_source) project but after some time, we decided to release certain parts of the project as Open Source. 17 | That's the case with the `mercury-dataschema` package. 18 | 19 | If you're interested in learning more about the Mercury project, we recommend reading this blog [post](https://www.bbvaaifactory.com/mercury-acelerando-la-reutilizacion-en-ciencia-de-datos-dentro-de-bbva/) from www.bbvaaifactory.com 20 | 21 | ## User installation 22 | 23 | The easiest way to install `mercury-dataschema` is using ``pip``: 24 | 25 | pip install -U mercury-dataschema 26 | 27 | ## Example 28 | 29 | ```python 30 | from mercury.dataschema.schemagen import DataSchema 31 | from mercury.dataschema.feature import FeatType 32 | 33 | dataset = UCIDataset().load() # Any DataFrame 34 | 35 | schma = (DataSchema() # Generate a lazy Schema object 36 | .generate(dataset) # Manually trigger its construction (it mostly infers data types...)
37 | .calculate_statistics()) # Manually trigger extra statistic calculations for each feature 38 | ``` 39 | 40 | Then, we can inspect all the features with 41 | 42 | ```python 43 | schma.feats 44 | ``` 45 | 46 | ``` 47 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 48 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 49 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 50 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 51 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 52 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 53 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 54 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 55 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 56 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 57 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 58 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 59 | 'BILL_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 60 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 61 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 62 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 63 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 64 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 65 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 66 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 67 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 68 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 69 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 70 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 71 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 72 | ``` 73 | 74 | 
And we can get extra feature statistics by inspecting the .stats attribute of the `Feature` objects. 75 | 76 | ```python 77 | schma.feats['BILL_AMT4'].stats 78 | ``` 79 | 80 | ``` 81 | {'num_nan': 0, 82 | 'percent_nan': 0.0, 83 | 'samples': 30000, 84 | 'percent_unique': 0.7182666666666667, 85 | 'cardinality': 21548, 86 | 'min': -170000.0, 87 | 'max': 891586.0, 88 | 'distribution': [3.3333333333333335e-05, 89 | 0.0, 90 | 3.3333333333333335e-05, 91 | 0.0, 92 | 0.0, 93 | 3.3333333333333335e-05, 94 | 0.0, 95 | 3.3333333333333335e-05, 96 | 3.3333333333333335e-05, 97 | 0.0, 98 | 3.3333333333333335e-05, 99 | 6.666666666666667e-05, 100 | 6.666666666666667e-05, 101 | 0.00016666666666666666, 102 | ..., 103 | 0.0, 104 | 0.0, 105 | 0.0, 106 | 0.0, 107 | 0.0, 108 | 3.3333333333333335e-05], 109 | 'distribution_bins': [-170000.0, 110 | -163898.93103448275, 111 | -157797.8620689655, 112 | -151696.7931034483, 113 | ..., 114 | 867181.724137931, 115 | 873282.7931034482, 116 | 879383.8620689653, 117 | 885484.9310344828, 118 | 891586.0]} 119 | ``` 120 | 121 | ```python 122 | schma.feats 123 | ``` 124 | 125 | ``` 126 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 127 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 128 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 129 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 130 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 131 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 132 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 133 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 134 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 135 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 136 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 137 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 138 | 'BILL_AMT1': Discrete Feature 
(NAME=None, dtype=DataType.FLOAT), 139 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 140 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 141 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 142 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 143 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 144 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 145 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 146 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 147 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 148 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 149 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 150 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 151 | ``` 152 | 153 | Note how for different features, the computed statistics vary: 154 | 155 | ```python 156 | schma.feats['default.payment.next.month'].stats 157 | ``` 158 | 159 | ``` 160 | {'num_nan': 0, 161 | 'percent_nan': 0.0, 162 | 'samples': 30000, 163 | 'percent_unique': 6.666666666666667e-05, 164 | 'cardinality': 2, 165 | 'distribution': [0.7788, 0.2212], 166 | 'distribution_bins': [0, 1], 167 | 'domain': [1, 0]} 168 | ``` 169 | 170 | ## Saving and loading schemas 171 | 172 | You can serialize and reload `DataSchema`s so you can reuse them in the future. 173 | 174 | ```python 175 | PATH = 'schma.json' 176 | # Save the schema 177 | schma.save(PATH) 178 | 179 | # Load it back! 180 | recovered = DataSchema.load(PATH) 181 | ``` 182 | 183 | ## Help and support 184 | 185 | This library is currently maintained by a dedicated team of data scientists and machine learning engineers from BBVA AI Factory. 
186 | 187 | ### Documentation 188 | website: https://bbva.github.io/mercury-dataschema/site/ 189 | 190 | ### Email 191 | mercury.group@bbva.com 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mercury-dataschema 2 | 3 | [![](https://github.com/BBVA/mercury-dataschema/actions/workflows/test.yml/badge.svg)](https://github.com/BBVA/mercury-dataschema) 4 | ![](https://img.shields.io/badge/latest-1.1.2-blue) 5 | [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-3816/) 6 | [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-3916/) 7 | [![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-31011/) 8 | [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3119/) 9 | [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3128/) 10 | [![Python 3.13](https://img.shields.io/badge/python-3.13-blue.svg)](https://www.python.org/downloads/release/python-3131/) 11 | [![Apache 2 license](https://shields.io/badge/license-Apache%202-blue)](http://www.apache.org/licenses/LICENSE-2.0) 12 | [![Ask Me Anything !](https://img.shields.io/badge/Ask%20me-anything-1abc9c.svg)](https://github.com/BBVA/mercury-dataschema/issues) 13 | 14 | `mercury-dataschema` is a submodule of the Mercury library which acts as a utility tool that, given a Pandas DataFrame, its `DataSchema` class auto-infers feature types and automatically calculates different statistics depending on them. 15 | 16 | This type inference isn't solely based on data types but in the information the variables contain. 
For example: if a feature is encoded as a `float` but its cardinality is 2, we can be sure it's a binary feature. 17 | 18 | This package is used by other Mercury submodules, and you can also use it separately from the rest of the library. 19 | 20 | As an idea (there are plenty of them, though), it is particularly useful when preprocessing datasets. Having to specify the typical `categorical_cols` and `continuous_cols` is over! 21 | 22 | ## Mercury project at BBVA 23 | 24 | Mercury is a collaborative library that was developed by the Advanced Analytics community at BBVA. Originally, it was created as an [InnerSource](https://en.wikipedia.org/wiki/Inner_source) project but after some time, we decided to release certain parts of the project as Open Source. 25 | That's the case with the `mercury-dataschema` package. 26 | 27 | If you're interested in learning more about the Mercury project, we recommend reading this blog [post](https://www.bbvaaifactory.com/mercury-acelerando-la-reutilizacion-en-ciencia-de-datos-dentro-de-bbva/) from www.bbvaaifactory.com 28 | 29 | ## User installation 30 | 31 | The easiest way to install `mercury-dataschema` is using ``pip``: 32 | 33 | pip install -U mercury-dataschema 34 | 35 | ## Example 36 | 37 | ```python 38 | from mercury.dataschema.schemagen import DataSchema 39 | from mercury.dataschema.feature import FeatType 40 | 41 | dataset = UCIDataset().load() # Any DataFrame 42 | 43 | schma = (DataSchema() # Generate a lazy Schema object 44 | .generate(dataset) # Manually trigger its construction (it mostly infers data types...)
45 | .calculate_statistics()) # Manually trigger extra statistic calculations for each feature 46 | ``` 47 | 48 | Then, we can inspect all the features with 49 | 50 | ```python 51 | schma.feats 52 | ``` 53 | 54 | ``` 55 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 56 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 57 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 58 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 59 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 60 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 61 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 62 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 63 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 64 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 65 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 66 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 67 | 'BILL_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 68 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 69 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 70 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 71 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 72 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 73 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 74 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 75 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 76 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 77 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 78 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 79 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 80 | ``` 81 | 82 | 
And we can get extra feature statistics by inspecting the .stats attribute of the `Feature` objects. 83 | 84 | ```python 85 | schma.feats['BILL_AMT4'].stats 86 | ``` 87 | 88 | ``` 89 | {'num_nan': 0, 90 | 'percent_nan': 0.0, 91 | 'samples': 30000, 92 | 'percent_unique': 0.7182666666666667, 93 | 'cardinality': 21548, 94 | 'min': -170000.0, 95 | 'max': 891586.0, 96 | 'distribution': [3.3333333333333335e-05, 97 | 0.0, 98 | 3.3333333333333335e-05, 99 | 0.0, 100 | 0.0, 101 | 3.3333333333333335e-05, 102 | 0.0, 103 | 3.3333333333333335e-05, 104 | 3.3333333333333335e-05, 105 | 0.0, 106 | 3.3333333333333335e-05, 107 | 6.666666666666667e-05, 108 | 6.666666666666667e-05, 109 | 0.00016666666666666666, 110 | ..., 111 | 0.0, 112 | 0.0, 113 | 0.0, 114 | 0.0, 115 | 0.0, 116 | 3.3333333333333335e-05], 117 | 'distribution_bins': [-170000.0, 118 | -163898.93103448275, 119 | -157797.8620689655, 120 | -151696.7931034483, 121 | ..., 122 | 867181.724137931, 123 | 873282.7931034482, 124 | 879383.8620689653, 125 | 885484.9310344828, 126 | 891586.0]} 127 | ``` 128 | 129 | ```python 130 | schma.feats 131 | ``` 132 | 133 | ``` 134 | {'ID': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 135 | 'LIMIT_BAL': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 136 | 'SEX': Binary Feature (NAME=None, dtype=DataType.INTEGER), 137 | 'EDUCATION': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 138 | 'MARRIAGE': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 139 | 'AGE': Discrete Feature (NAME=None, dtype=DataType.INTEGER), 140 | 'PAY_0': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 141 | 'PAY_2': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 142 | 'PAY_3': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 143 | 'PAY_4': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 144 | 'PAY_5': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 145 | 'PAY_6': Categorical Feature (NAME=None, dtype=DataType.INTEGER), 146 | 'BILL_AMT1': Discrete 
Feature (NAME=None, dtype=DataType.FLOAT), 147 | 'BILL_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 148 | 'BILL_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 149 | 'BILL_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 150 | 'BILL_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 151 | 'BILL_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 152 | 'PAY_AMT1': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 153 | 'PAY_AMT2': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 154 | 'PAY_AMT3': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 155 | 'PAY_AMT4': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 156 | 'PAY_AMT5': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 157 | 'PAY_AMT6': Discrete Feature (NAME=None, dtype=DataType.FLOAT), 158 | 'default.payment.next.month': Binary Feature (NAME=None, dtype=DataType.INTEGER)} 159 | ``` 160 | 161 | Note how for different features, the computed statistics vary: 162 | 163 | ```python 164 | schma.feats['default.payment.next.month'].stats 165 | ``` 166 | 167 | ``` 168 | {'num_nan': 0, 169 | 'percent_nan': 0.0, 170 | 'samples': 30000, 171 | 'percent_unique': 6.666666666666667e-05, 172 | 'cardinality': 2, 173 | 'distribution': [0.7788, 0.2212], 174 | 'distribution_bins': [0, 1], 175 | 'domain': [1, 0]} 176 | ``` 177 | 178 | ## Example notebooks 179 | 180 | ```python 181 | from mercury.dataschema import create_tutorials 182 | 183 | create_tutorials('.') # Creates a folder with example notebooks in the current path. 184 | ``` 185 | 186 | ## Saving and loading schemas 187 | 188 | You can serialize and reload `DataSchema`s so you can reuse them in the future. 189 | 190 | ```python 191 | PATH = 'schma.json' 192 | # Save the schema 193 | schma.save(PATH) 194 | 195 | # Load it back! 
196 | recovered = DataSchema.load(PATH) 197 | ``` 198 | 199 | ## Help and support 200 | 201 | This library is currently maintained by a dedicated team of data scientists and machine learning engineers from BBVA. 202 | 203 | ### Documentation 204 | website: https://bbva.github.io/mercury-dataschema/site/ 205 | 206 | ### Email 207 | mercury.group@bbva.com 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /mercury/dataschema/feature.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | 7 | class DataType(Enum): 8 | INTEGER = 'INTEGER' 9 | FLOAT = 'FLOAT' 10 | STRING = 'STRING' 11 | DATE = 'DATE' 12 | BOOL = 'BOOL' 13 | CATEGORICAL = 'CATEGORICAL' # for pandas categorical type 14 | UNKNOWN = 'UNKNOWN' 15 | 16 | 17 | class FeatType(Enum): 18 | BINARY = 'BINARY' 19 | CATEGORICAL = 'CATEGORICAL' 20 | DISCRETE = 'DISCRETE' 21 | CONTINUOUS = 'CONTINUOUS' 22 | UNKNOWN = 'UNKNOWN' 23 | 24 | 25 | class Feature: 26 | """ This class represents a generic feature within a schema. 
class BinaryFeature(Feature):
    """ This class represents a binary feature within a schema
    (i.e. only two possible values).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes the base statistics plus a fixed two-bin distribution.
        The domain of a binary feature is exactly its set of unique values. """
        super().build_stats(column, calculator)
        self.stats['domain'] = self.cache['uniques']
        calculator.distribution(column, self, bins=2)
        return self

    def __str__(self):
        return f"Binary Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.BINARY


class CategoricalFeature(Feature):
    """ This class represents a categorical feature within a schema
    (i.e. only N possible values).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics, the feature domain and its distribution.

        When the calculator defines a numeric `limit_categorical_perc`, high
        cardinality domains are truncated to the most frequent values so the
        serialized schema stays small.
        """
        super().build_stats(column, calculator)

        perc_limit = calculator.limit_categorical_perc
        domain = self.cache['uniques']

        if isinstance(perc_limit, (int, float)):
            if not 0 < perc_limit < 1:
                raise ValueError("Input Error: 'limit_categorical_perc' must be a float between 0 and 1")

            if len(self.cache['uniques']) / self.stats['samples'] > perc_limit:
                warnings.warn(f"{self.name} will be truncated in both statistics 'domain' and 'distribution' with the most frequent values")
                # Keep only the N most frequent values, N proportional to the dataset size.
                keep = int(perc_limit * self.stats['samples'])
                domain = list(column.value_counts().index[:keep])

        self.stats['domain'] = domain
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Categorical Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.CATEGORICAL


class DiscreteFeature(Feature):
    """ This class represents a discrete feature within a schema
    (i.e. any number without decimals).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics plus min, max and the distribution. """
        super().build_stats(column, calculator)
        calculator.min(column, self)
        calculator.max(column, self)
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Discrete Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.DISCRETE


class ContinuousFeature(Feature):
    """ This class represents a continuous feature within a schema
    (e.g. a float).

    Args:
        name (str): Feature name
        dtype (str): Data type of the feature
    """
    def __init__(self, name=None, dtype=None):
        super().__init__(name, dtype)

    def build_stats(self, column, calculator):
        """ Computes base statistics plus min, max, mean, std and the distribution. """
        super().build_stats(column, calculator)
        calculator.min(column, self)
        calculator.max(column, self)
        calculator.mean(column, self)
        calculator.std(column, self)
        calculator.distribution(column, self)
        return self

    def __str__(self):
        return f"Continuous Feature (NAME={self.name}, dtype={self.dtype})"

    def __repr__(self):
        return str(self)

    def _get_enum_feat_type(self):
        return FeatType.CONTINUOUS
class FeatureFactory:
    """ Builds `Feature` objects from pandas columns, inferring both the low
    level datatype (int, float, string, ...) and the high level feature type
    (binary, categorical, discrete or continuous).
    """

    def __init__(self):
        pass

    def infer_datatype(self, column: "pandas.Series", feature: Feature) -> DataType:  # noqa: F821
        """ Finds out the data type of the column.

        Args:
            column: column which datatype will be inferred
            feature: Feature object. This is needed because we want to cache several internal
                     operations, so future calls are faster.

        Returns:
            Returns the datatype of the column
        """
        datatype = DataType.UNKNOWN

        if column.dtype.name == 'category':
            datatype = DataType.CATEGORICAL
        elif np.issubdtype(column, np.integer):
            datatype = DataType.INTEGER
        elif np.issubdtype(column, np.bool_):
            datatype = DataType.BOOL
        elif np.issubdtype(column, np.floating):
            datatype = DataType.FLOAT
        elif np.issubdtype(column, np.object_):
            # Sample one non-NaN value to decide the type. Guard against fully
            # NaN columns, where there is nothing to sample (previously this
            # raised IndexError on `.iloc[0]`).
            no_nan = feature.cache['no_nan_filtered']
            if len(no_nan) > 0 and type(no_nan.iloc[0]) is str:
                datatype = DataType.STRING
            # TODO: the value could also be another array
            # TODO: the value could also be a json (dict)
            # TODO: the value could also be a datetime

        return datatype

    def _build_dummy_feature(self, datatype: DataType, feat_type: FeatType, name: str) -> Feature:
        """ Returns a dummy and uninitialized feature. This method is not intended to be
        used apart from serialization purposes.
        """
        constructors = {
            FeatType.BINARY: BinaryFeature,
            FeatType.CATEGORICAL: CategoricalFeature,
            FeatType.DISCRETE: DiscreteFeature,
            FeatType.CONTINUOUS: ContinuousFeature,
        }
        # Unknown feature types fall back to the generic Feature class.
        feat = constructors.get(feat_type, Feature)()
        feat.dtype = datatype
        feat.name = name

        return feat

    def _infer_feature_type_from_float(self, feat, threshold_categorical, colname, verbose=False):
        """ Decides whether a FLOAT column is categorical, discrete or continuous,
        based on whether it holds decimals and on its percentage of unique values. """
        if (feat.cache['no_nan_filtered'] % 1 == 0).all():  # The float column doesn't contain decimals
            if feat.stats['percent_unique'] < threshold_categorical:
                # Case Categorical as float
                if verbose:
                    warnings.warn(
                        f"""FLOAT feature {colname} converted to Categorical because percentage of unique """
                        f"""values {feat.stats['percent_unique']} is lower than threshold {threshold_categorical}""",
                        RuntimeWarning
                    )
                return FeatType.CATEGORICAL

            # Case Discrete as Float
            return FeatType.DISCRETE

        # If it does contain decimals, directly create Continuous Feature (categorical feature would rarely be
        # codified as floats with decimals)
        return FeatType.CONTINUOUS

    def _infer_feature_type_from_int(self, feat, threshold_categorical, colname, verbose=False):
        """ Decides whether an INTEGER column is discrete or categorical, based
        on its percentage of unique values. """
        if feat.stats['percent_unique'] >= threshold_categorical:
            return FeatType.DISCRETE

        if verbose:
            warnings.warn(
                f"""INTEGER feature {colname} converted to Categorical because percentage of unique """
                f"""values {feat.stats['percent_unique']} is lower than threshold {threshold_categorical}""",
                RuntimeWarning
            )
        return FeatType.CATEGORICAL

    def build_feature(self,
                      column: 'pandas.Series',  # noqa: F821
                      colname: str = None,
                      threshold_categorical: float = 1e-5,
                      force_feat_type: FeatType = None,
                      verbose: bool = True
                      ) -> Feature:
        """ Builds a schema Feature object given a column.

        Args:
            column: Column to be analyzed
            colname: Name of the column (feature)
            threshold_categorical: percentage of necessary unique values for a feature to be considered
                                   categorical. If the percentage of unique values < cat_threshold, the
                                   column will be taken as categorical. This parameter can be a single float
                                   (same threshold for all columns) or a dict in which each key is the name of
                                   the column. Use the later for custom thresholds per column.
            force_feat_type: If user wants to force a variable to be of certain type, he/she can use
                             this parameter and its type will not be auto-inferred, but set to this.
            verbose: If this is set to False, possible inner warnings won't be shown.

        Returns:
            Feature with only the base statistics calculated
        """
        feat = Feature().build_stats(column)
        datatype = self.infer_datatype(column, feat)
        feat_type = FeatType.UNKNOWN

        # If user forces the feature type we kindly fulfill his/her wishes
        if force_feat_type is not None:
            featret = self._build_dummy_feature(datatype, force_feat_type, colname)
            # Bug fix: this previously did `feat.stats.update(feat.stats)` on the
            # freshly-built dummy (a self-update no-op), so the forced feature was
            # returned with empty stats. Copy the base stats computed above instead.
            featret.stats.update(feat.stats)
            return featret

        if feat.stats['cardinality'] == 2:
            feat_type = FeatType.BINARY
        else:
            # Data could still be either categorical, discrete or continuous
            if datatype is DataType.FLOAT:
                feat_type = self._infer_feature_type_from_float(feat, threshold_categorical, colname, verbose=verbose)

            if datatype is DataType.INTEGER:
                feat_type = self._infer_feature_type_from_int(feat, threshold_categorical, colname, verbose=verbose)

            if (datatype is DataType.STRING) or (datatype is DataType.CATEGORICAL):
                feat_type = FeatType.CATEGORICAL

        featret = self._build_dummy_feature(datatype, feat_type, colname)
        featret.stats.update(feat.stats)
        return featret
def test_dataschema_build(datasets):
    """ The generated schema must assign the expected Feature subclass and name
    to every column of the tips / titanic datasets. """
    tips, titanic = datasets

    schma = DataSchema().generate(tips)

    expected_types = {
        'sex': BinaryFeature,
        'smoker': BinaryFeature,
        'time': BinaryFeature,
        'size': CategoricalFeature,
        'day': CategoricalFeature,
        'total_bill': ContinuousFeature,
        'tip': ContinuousFeature,
    }
    for colname, expected_cls in expected_types.items():
        assert isinstance(schma.feats[colname], expected_cls)

    for colname in ('sex', 'size', 'total_bill'):
        assert schma.feats[colname].name == colname

    schma = DataSchema().generate(titanic)
    assert isinstance(schma.feats['deck'], CategoricalFeature)
    assert schma.feats['deck'].stats['percent_nan'] > 0
    assert schma.feats['adult_male'].dtype == DataType.BOOL
    assert schma.feats['adult_male'].name == 'adult_male'


def test_dataschema_stats(datasets):
    """ Checks a handful of the statistics calculated by calculate_statistics(). """
    tips, titanic = datasets

    schma = DataSchema().generate(tips).calculate_statistics()
    tip_stats = schma.feats['tip'].stats

    assert tip_stats['min'] == 1.0
    assert tip_stats['max'] == 10.0
    assert tip_stats['mean'] == pytest.approx(2.99827868852459)
    assert tip_stats['percent_unique'] == pytest.approx(0.5040983606557377)

    schma = DataSchema().generate(titanic).calculate_statistics()
    sex_stats = schma.feats['sex'].stats
    assert sex_stats['distribution_bins'][0] == 'female'
    assert sex_stats['distribution_bins'][1] == 'male'
    assert sex_stats['distribution'][0] == pytest.approx(0.35655738, 0.1)


def test_dataschema_stats_custom_params(datasets):
    """ calculate_statistics() must honor both a single shared config and
    per-feature config dicts, and validate `limit_categorical_perc`. """
    _, titanic = datasets

    # Default binning produces more than 10 bins for 'age'.
    schma = DataSchema().generate(titanic).calculate_statistics()
    assert len(schma.feats['age'].stats['distribution']) > 10

    # Single shared config.
    schma = DataSchema().generate(titanic).calculate_statistics({'distribution_bins_method': 5})
    assert len(schma.feats['age'].stats['distribution']) == 5

    # One config per feature.
    per_feature = {
        'age': {'distribution_bins_method': 5},
        'fare': {'distribution_bins_method': 3}
    }
    schma = DataSchema().generate(titanic).calculate_statistics(per_feature)
    assert len(schma.feats['age'].stats['distribution']) == 5
    assert len(schma.feats['fare'].stats['distribution']) == 3

    # High-cardinality string column gets its domain truncated.
    titanic = titanic.reset_index().rename(columns={'index': 'ID'})
    titanic["ID"] = titanic["ID"].astype(str)
    schma = DataSchema().generate(titanic).calculate_statistics({'limit_categorical_perc': 0.05})
    assert len(schma.feats['ID'].stats['domain']) == 44

    # Out-of-range limit raises.
    with pytest.raises(ValueError) as e:
        DataSchema().generate(titanic).calculate_statistics({'limit_categorical_perc': 5})
    assert "Input Error: 'limit_categorical_perc' must be a float between 0 and 1" in str(e.value)
def test_errors_dataschema_anonymize(datasets):
    """ Checks all the error paths of DataSchema.anonymize / deanonymize. """
    tips, titanic = datasets

    schma = DataSchema().generate(titanic)

    # Empty params dict. Note: spelling fixed from "anonymise" to match the
    # message actually raised by DataSchema.anonymize().
    with pytest.raises(UserWarning) as w:
        schma.anonymize({})
    assert "To anonymize, it is necessary to use a dictionary with the format: {'var1':anonymizer1, 'var2':anonymizer2}" in str(w.value)

    an_encrypt = Anonymize(0)
    an_encrypt.set_key("07jaPY")

    # Non categorical/binary feature.
    with pytest.raises(ValueError) as e:
        schma.anonymize({'fare': an_encrypt})
    assert "Input Error: Anonymize only supports Categorical or Binary variables ->" in str(e.value)

    # Unknown column name.
    with pytest.raises(ValueError) as e:
        schma.anonymize({'farer': an_encrypt})
    assert "Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema" in str(e.value)

    # Bug fix: this assert was a bare string literal (always truthy), so the
    # warning message was never actually checked. Also spelling aligned with
    # the message raised by deanonymize().
    with pytest.raises(UserWarning) as w:
        schma.deanonymize({})
    assert "To De-anonymize, it is necessary to use a dictionary with the format: {'var1':anonym1, 'var2':anonym2}" in str(w.value)

    with pytest.raises(ValueError) as e:
        schma.deanonymize({'fare': an_encrypt})
    assert "Input Error: Deanonymize only supports Categorical or Binary variables ->" in str(e.value)

    # Bug fix: deanonymize() validates unknown keys *before* checking feature
    # types, so an unknown column raises the keys error, not the type error.
    with pytest.raises(ValueError) as e:
        schma.deanonymize({'farer': an_encrypt})
    assert "Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema" in str(e.value)
def test_generate_manual(datasets):
    """ generate_manual() must honor the user-provided type lists and default
    every other column to continuous. """
    tips, titanic = datasets

    schma = DataSchema().generate_manual(
        titanic,
        categ_columns=['class'],
        discrete_columns=['age'],
        binary_columns=['survived', 'alive']
    )

    forced = {
        'class': CategoricalFeature,
        'age': DiscreteFeature,
        'alive': BinaryFeature,
        'survived': BinaryFeature,
    }
    for key, item in schma.feats.items():
        assert isinstance(item, forced.get(key, ContinuousFeature))

    # With empty lists, everything is continuous by default.
    schma = DataSchema().generate_manual(
        titanic,
        categ_columns=[],
        discrete_columns=[],
        binary_columns=[]
    )
    for item in schma.feats.values():
        assert isinstance(item, ContinuousFeature)


def test_validate(datasets):
    """ validate() must reject schemas with mismatching dtypes or feature sets. """
    tips, titanic = datasets

    # Same columns, but 'deck' becomes an integer constant -> dtype mismatch.
    altered = titanic.copy()
    altered['deck'] = 0

    schma = DataSchema().generate(titanic)
    schma_altered = DataSchema().generate(altered)

    with pytest.raises(RuntimeError) as exinfo:
        schma.validate(schma_altered)
    assert "Data types types do not match. 'deck' in other is DataType.INTEGER. However, DataType.STRING is expected." in str(exinfo.value)

    # Missing column -> feature set mismatch.
    dropped = titanic.drop('deck', axis=1)
    schma_dropped = DataSchema().generate(dropped)

    with pytest.raises(RuntimeError) as exinfo:
        schma.validate(schma_dropped)
    assert "Features do not match." in str(exinfo.value)


def test_serialization(datasets, tmpdir):
    """ A schema saved to JSON and loaded back must validate in both directions. """
    tips, titanic = datasets

    schma = DataSchema().generate(titanic).calculate_statistics()
    path = str(tmpdir) + '/schema.json'
    schma.save(path)
    recovered = DataSchema.load(path)

    # If any of this fail, the serialization is wrong.
    schma.validate(recovered)
    recovered.validate(schma)


def test_get_features_by_type(datasets):
    """ get_features_by_type() must return exactly the columns with that dtype. """
    tips, titanic = datasets
    schema = DataSchema().generate(titanic)

    expected_str = {'class', 'alive', 'deck', 'embark_town', 'embarked', 'sex', 'who'}
    expected_float = {'age', 'fare'}
    assert set(schema.get_features_by_type(DataType.STRING)) == expected_str
    assert set(schema.get_features_by_type(DataType.FLOAT)) == expected_float
def test_pandas_categorical_type():
    # Test added after bug discovery that schemas with dataframes with categorical type raise Exception
    rng_int = np.random.choice([0, 1, 2, 3], size=100)
    rng_str = np.random.choice(["A", "B", "C", "D"], size=100)

    df = pd.DataFrame(data={
        'categorical_int': rng_int,
        'categorical_str': rng_str
    }).astype({
        'categorical_int': 'category',
        'categorical_str': 'category'
    })

    schema = DataSchema().generate(df)

    for colname in ('categorical_int', 'categorical_str'):
        assert colname in schema.categorical_feats
        assert isinstance(schema.feats[colname], CategoricalFeature)


def test_float_conversions():
    """ Float columns must be mapped to categorical, discrete or continuous
    depending on whether they hold few uniques, integers-as-floats, or real
    decimal values. """
    df = pd.DataFrame(data={
        'float_categorical': np.random.choice([0., 1., 2.], size=1000),
        'float_discrete': np.random.randint(0, 10000, size=1000).astype(float),
        'float_continous': np.random.uniform(0, 10000, size=1000)
    })

    schema = DataSchema().generate(df)

    assert isinstance(schema.feats['float_categorical'], CategoricalFeature)
    assert isinstance(schema.feats['float_discrete'], DiscreteFeature)
    assert isinstance(schema.feats['float_continous'], ContinuousFeature)
'int_categorical'], 343 | # num_feats=['int_discrete', 'float_discrete', 'float_continous'] 344 | # ) 345 | # assert isinstance(schema.feats['float_categorical'], CategoricalFeature) 346 | # assert isinstance(schema.feats['float_discrete'], DiscreteFeature) 347 | # assert isinstance(schema.feats['float_continous'], ContinuousFeature) 348 | # assert isinstance(schema.feats['int_categorical'], CategoricalFeature) 349 | # assert isinstance(schema.feats['int_discrete'], DiscreteFeature) 350 | 351 | # If a feature is specified both as numerical and categorical, then an exception is raised 352 | # with pytest.raises(ValueError) as exinfo: 353 | # schema = DataSchema().generate(df, cat_feats=['float_discrete'], num_feats=['float_discrete']) 354 | 355 | # String column as a numeric doesn't change it (raises warning) 356 | df['str_float_categorical'] = df['float_categorical'].astype(str) 357 | schema = DataSchema().generate(df) 358 | assert isinstance(schema.feats['str_float_categorical'], CategoricalFeature) 359 | 360 | -------------------------------------------------------------------------------- /mercury/dataschema/schemagen.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | from typing import Union, List, Dict 5 | 6 | from .feature import ( 7 | FeatureFactory, 8 | ContinuousFeature, 9 | DiscreteFeature, 10 | CategoricalFeature, 11 | BinaryFeature 12 | ) 13 | 14 | from .feature import ( 15 | DataType, 16 | FeatType 17 | ) 18 | 19 | from .calculator import StatCalculatorFactory 20 | 21 | 22 | class DataSchema: 23 | """ Dataset schema 24 | 25 | This class takes a dataframe and generates its schema as a collection of feature. 26 | Feature objects. Each one of them will contain metadata and statistics about a 27 | column of the original dataframe that can be further exploded. 
28 | 29 | 30 | Example: 31 | ```python 32 | >>> schma = DataSchema()\ 33 | >>> .generate(dataset)\ 34 | >>> .calculate_statistics() 35 | 'DISBURSED_AMOUNT': Categorical Feature (NAME=DISBURSED_AMOUNT, dtype=DataType.INTEGER), 36 | 'ASSET_COST': Categorical Feature (NAME=ASSET_COST, dtype=DataType.INTEGER), 37 | 'LTV': Continuous Feature (NAME=LTV, dtype=DataType.FLOAT), 38 | 'BUREAU_SCORE': Discrete Feature (NAME=BUREAU_SCORE, dtype=DataType.INTEGER), 39 | 'BUREAU_SCORE_DESCRIPTION': Categorical Feature (NAME=BUREAU_SCORE_DESCRIPTION, dtype=DataType.STRING), 40 | 'NEW_LOANS_IN_LAST_SIX_MONTHS': Discrete Feature (NAME=NEW_LOANS_IN_LAST_SIX_MONTHS, dtype=DataType.INTEGER), 41 | 'DEFAULTED_LOANS_IN_LAST_SIX_MONTHS': Discrete Feature (NAME=DEFAULTED_LOANS_IN_LAST_SIX_MONTHS, dtype=DataType.INTEGER), 42 | 'NUM_LOANS_TAKEN': Discrete Feature (NAME=NUM_LOANS_TAKEN, dtype=DataType.INTEGER), 43 | 'NUM_ACTIVE_LOANS': Discrete Feature (NAME=NUM_ACTIVE_LOANS, dtype=DataType.INTEGER), 44 | 'NUM_DEFAULTED_LOANS': Discrete Feature (NAME=NUM_DEFAULTED_LOANS, dtype=DataType.INTEGER), 45 | 'AGE': Discrete Feature (NAME=AGE, dtype=DataType.INTEGER), 46 | 'GENDER': Binary Feature (NAME=GENDER, dtype=DataType.STRING), 47 | 'CIVIL_STATUS': Categorical Feature (NAME=CIVIL_STATUS, dtype=DataType.STRING), 48 | 'ORIGIN': Binary Feature (NAME=ORIGIN, dtype=DataType.STRING), 49 | 'DIGITAL': Binary Feature (NAME=DIGITAL, dtype=DataType.INTEGER), 50 | 'SCORE': Continuous Feature (NAME=SCORE, dtype=DataType.FLOAT), 51 | 'PREDICTION': Binary Feature (NAME=PREDICTION, dtype=DataType.INTEGER)} 52 | >>> schma.feats['SCORE'].stats 53 | {'num_nan': 0, 54 | 'percent_nan': 0.0, 55 | 'samples': 233154, 56 | 'percent_unique': 0.7967352050576014, 57 | 'cardinality': 185762, 58 | 'min': 0.17454321487679067, 59 | 'max': 0.9373813084029072, 60 | 'mean': 0.7625553210045813, 61 | 'std': 0.15401509786623635, 62 | 'distribution': array([7.48617716e-07, 1.07579979e-06, 1.40298186e-06, 1.73016394e-06, 63 | 
2.05734601e-06, 2.38452809e-06, 2.71171016e-06, 3.03889224e-06, 64 | 3.36607431e-06, 3.69325638e-06, 4.02043846e-06])} 65 | # Specifying custom parameters (shared among all features) for the calculate_statistics method 66 | >>> schma = DataSchema()\ 67 | ... .generate(dataset)\ 68 | ... .calculate_statistics({'distribution_bins_method': 'sqrt'}) # Specify bin generation method (see numpy.hist) 69 | 70 | # We can also specify granular statistic parameters per variable 71 | >>> schma = DataSchema()\ 72 | ... .generate(dataset)\ 73 | ... .calculate_statistics({'SCORE': {'distribution_bins_method': 'sqrt'}}) # Specify bin generation method (see numpy.hist) 74 | 75 | >>> schma = DataSchema()\ 76 | ... .generate(dataset)\ 77 | ... .calculate_statistics({'SCORE': {'distribution_bins_method': 5}}) # Specify 5 bins only for numerical features 78 | ``` 79 | """ 80 | def __init__(self): 81 | self.dataframe = None 82 | self.feats = {} 83 | self._feat_factory = None 84 | self._generated = False 85 | 86 | def generate_manual( 87 | self, 88 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"], # noqa: F821 89 | categ_columns: List[str], 90 | discrete_columns: List[str], 91 | binary_columns: List[str], 92 | custom_stats: dict = None, 93 | ) -> "DataSchema": 94 | """ Builds the schema manually. This acts like `generate()` but in a more restrictive way. 95 | All the names passed to `categ_columns` will be taken as categorical features, no more, no less. 96 | It will avoid making automatic type inference on every feature not in `categ_columns`. 97 | The same rule is applied on `discrete_columns`. 98 | 99 | Note: 100 | This method is considered to be low level. If you use this, make sure the type assignment 101 | to each feature type is compatible with the datatypes (float, int, string,...) in the column or 102 | a later call to `calculate_statistics` could fail. 103 | 104 | Args: 105 | dataframe (pd.DataFrame): DataFrame on which the schema will be inferred. 
106 | categ_columns (List[str]): list of columns which will be forced to be taken as categorical. Warning: 107 | all features not in this list are guaranteed not being categorical 108 | discrete_columns (List[str]): list of columns which will be forced to be taken as discrete. Warning: 109 | all features not in this list are guaranteed not to be taken as discrete (i.e. 110 | they will be continuous). 111 | binary_columns (List[str]): list of column which will be forced to be taken as binary. 112 | custom_stats (Optional[Dict[str, Any]]): Custom statistics to be calculated for each column. 113 | """ 114 | force_types = {} 115 | for col in dataframe.columns: 116 | if col in categ_columns: 117 | force_types[col] = FeatType.CATEGORICAL 118 | else: 119 | # Is in either binary, continuous or discrete lists 120 | if col in discrete_columns: 121 | force_types[col] = FeatType.DISCRETE 122 | elif col in binary_columns: 123 | force_types[col] = FeatType.BINARY 124 | else: 125 | force_types[col] = FeatType.CONTINUOUS 126 | 127 | return self.generate( 128 | dataframe=dataframe, 129 | force_types=force_types, 130 | verbose=False, 131 | custom_stats=custom_stats 132 | ) 133 | 134 | def generate( 135 | self, 136 | dataframe: Union["pandas.DataFrame", "pyspark.sql.DataFrame"], # noqa: F821 137 | force_types: Dict[str, FeatType] = None, 138 | custom_stats: dict = None, 139 | verbose: bool = True, 140 | ) -> "DataSchema": 141 | """ Builds the schema. For float and integer datatypes, by default the method tries to infer 142 | if a feature is categorical or numeric (Continuous or Discrete) depending on the percentage 143 | of unique values. However, that doesn't work in all the cases. In those cases, you can use 144 | the `force_types` param to specify which features should be categorical and which 145 | should be numeric independently of the percentage of unique values. 146 | 147 | Args: 148 | dataframe: DataFrame on which the schema will be inferred. 
149 | force_types: Dictionary with the form that contains the features to be 150 | forced to a specific type (Continuous, Discrete, Categorical...) 151 | custom_stats: Custom statistics to be calculated for each column 152 | verbose: whether to show or filter all possible warning messages 153 | """ 154 | if "pyspark" in str(type(dataframe)): 155 | raise RuntimeError("Sorry, Pyspark is not supported yet...") 156 | 157 | self.dataframe = dataframe 158 | self._generated = True 159 | 160 | self._feat_factory = FeatureFactory() 161 | 162 | inferring_types = True if force_types is None else False 163 | 164 | for col in self.dataframe.columns: 165 | thresh = self._get_threshold(len(self.dataframe)) 166 | 167 | # Look if the feature type has been specified 168 | forced_type = None 169 | if not inferring_types and col in force_types: 170 | forced_type = force_types[col] 171 | 172 | feat = self._feat_factory.build_feature( 173 | self.dataframe.loc[:, col], 174 | col, 175 | force_feat_type=forced_type, 176 | threshold_categorical=thresh, 177 | verbose=inferring_types and verbose # Only show warnings (if any) when using default args. 178 | ) 179 | self.feats[col] = feat 180 | 181 | return self 182 | 183 | def anonymize(self, anonymize_params: dict) -> "DataSchema": 184 | """ 185 | Anonymize the selected features of a data schema. 186 | 187 | Args: 188 | anonymize_params: Dictionary where the keys are the names of the columns to be anonymized and the values 189 | are mercury.contrib.dataschema.Anonymize objects that can be used to anonymize them. 190 | Raises: 191 | UserWarning, if anonymize_params is empty. 192 | ValueError, if the feature selected to deanonymize is not binary or categorical, or is not a feature of the dataschema. 
193 | """ 194 | if not anonymize_params: 195 | raise UserWarning("To anonymize, it is necessary to use a dictionary with the format: {'var1':anonymizer1, 'var2':anonymizer2}") 196 | 197 | if any(feat not in self.feats.keys() for feat in anonymize_params.keys()): 198 | raise ValueError("Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema") 199 | 200 | for feature in list(self.feats.keys()): 201 | anon = anonymize_params.get(feature) 202 | 203 | if anon: 204 | if not isinstance(self.feats[feature], (BinaryFeature, CategoricalFeature)): 205 | raise ValueError(f"Input Error: Anonymze only supports Categorical or Binary variables -> {feature}, You can use \ 206 | the `force_types` param in 'generate()' to specify which features should be categorical ") 207 | else: 208 | self.feats[feature].stats['distribution_bins'] = anon.\ 209 | anonymize_list_any_type(list(self.feats[feature].stats['distribution_bins'])) 210 | self.feats[feature].stats['domain'] = anon.\ 211 | anonymize_list_any_type(list(self.feats[feature].stats['domain'])) 212 | 213 | return self 214 | 215 | def deanonymize(self, anonymize_params: dict) -> "DataSchema": 216 | """ 217 | De-anonymize the selected features on a preloaded schema. 218 | 219 | Raises UserWarning, if anonymize_params is empty. 220 | Raises ValueError, if the feature selected to deanonymize is not binary or categorical, or is not a feature of the dataschema. 221 | 222 | Args: 223 | anonymize_params: Dictionary where the keys are the names of the columns to be deanonymized and the values 224 | are mercury.contrib.dataschema.Anonymize objects that can be used to deanonymize them. 
        """
        if not anonymize_params:
            raise UserWarning("To De-anonymize, it is necessary to use a dictionary with the format: {'var1':anonym1, 'var2':anonym2}")

        if any(feat not in self.feats.keys() for feat in anonymize_params.keys()):
            raise ValueError("Input Error: Keys of 'anonymize_params' dictionary must be columns name of the data schema")

        # Iterate the schema's features; anon is None for features that have no
        # entry in anonymize_params, so those are left untouched.
        for feature in list(self.feats.keys()):
            anon = anonymize_params.get(feature)

            if anon:

                if not isinstance(self.feats[feature], (BinaryFeature, CategoricalFeature)):
                    raise ValueError(f"Input Error: Deanonymize only supports Categorical or Binary variables -> {feature} ")
                else:
                    # Integer features are cast back to int after decoding, everything
                    # else to str (deanonymize_list presumably yields strings — TODO confirm).
                    operation = int if self.feats[feature].dtype == DataType.INTEGER else str
                    self.feats[feature].stats['distribution_bins'] = \
                        list(map(operation, anon.deanonymize_list(self.feats[feature].stats['distribution_bins'])))
                    self.feats[feature].stats['domain'] = \
                        list(map(operation, anon.deanonymize_list(self.feats[feature].stats['domain'])))
        return self

    def calculate_statistics(
        self,
        calculator_configs: dict = None
    ) -> "DataSchema":
        """ Triggers the computation of all statistics for all registered features
        of the schema.

        Args:
            calculator_configs: Optional configurations for each of the calculator parameters.
                This can be either a dict or a "dict of dicts". In the first case,
                the statistics for ALL FEATURES will be computed with those parameters.
                Additionally, you can specify a mapping of [feature_name: {config}] with
                granular configurations per feature.
                The supported configuration keys are the attributes declared within a calculator class.
                See mercury.contrib.dataschema.calculator.PandasStatCalculator (or Spark) for details.
        """
        featnames = list(self.feats.keys())

        calculator_configs = calculator_configs if calculator_configs else {}

        # User can pass us two types of dict:
        # - {'param': 'value', 'param2': 'value'}          -> Single config shared for all variables
        # - {'feat1': {config1}, 'feat2': {config2}, ...}  -> 1 config per variable
        # The two shapes are told apart by whether the first value is itself a dict.
        multiple_configs = len(calculator_configs) > 0 and isinstance(list(calculator_configs.values())[0], dict)

        # Case when user pass a single shared config: one calculator reused for
        # every feature.
        if not multiple_configs:
            calculator = StatCalculatorFactory.build_calculator(self.dataframe)
            calculator.set_config(**calculator_configs)

        for feature in featnames:
            if multiple_configs:
                # Case when user pass one config per variable: a fresh calculator
                # per feature; features without an entry keep the default config.
                calculator = StatCalculatorFactory.build_calculator(self.dataframe)
                if feature in calculator_configs:
                    calculator.set_config(**(calculator_configs[feature]))

            # Calculate distributions
            self.feats[feature].build_stats(self.dataframe.loc[:, feature], calculator)

        return self

    def _get_threshold(self, dataset_size):
        """ Calculates a dynamic threshold for determining whether a variable is categorical
        given the dataset. It uses an asymptotic function (whose lim->0) clipped to a maximum value of 1.
292 | """ 293 | return np.minimum(1, 50 / (dataset_size)) 294 | 295 | @property 296 | def continuous_feats(self) -> List[str]: 297 | """ List with the names of all continuous features 298 | """ 299 | return [key for key, feat in self.feats.items() if isinstance(feat, ContinuousFeature)] 300 | 301 | @property 302 | def categorical_feats(self) -> List[str]: 303 | """ List with the names of all categorical features 304 | """ 305 | return [key for key, feat in self.feats.items() if isinstance(feat, CategoricalFeature)] 306 | 307 | @property 308 | def binary_feats(self) -> List[str]: 309 | """ List with the names of all binary features 310 | """ 311 | return [key for key, feat in self.feats.items() if isinstance(feat, BinaryFeature)] 312 | 313 | @property 314 | def discrete_feats(self) -> List[str]: 315 | """ List with the names of all discrete features 316 | """ 317 | return [key for key, feat in self.feats.items() if isinstance(feat, DiscreteFeature)] 318 | 319 | def validate(self, other: "DataSchema"): 320 | """ Validates other schema with this one. The other schema will be considered 321 | valid if it shares the same feature names and datatypes with this. 322 | 323 | Raises RuntimeError if other schema differs from this one 324 | 325 | Args: 326 | other: other schema to be checked from this one 327 | """ 328 | # Check feature names match 329 | if list(self.feats.keys()) != list(other.feats.keys()): 330 | diff = set(self.feats.keys()) - set(other.feats.keys()) 331 | raise RuntimeError(f"Features do not match. These ones are not present on both datasets {list(diff)}") 332 | 333 | # Check feature and data types are the same 334 | for key, item in other.feats.items(): 335 | if not isinstance(item, self.feats[key].__class__): 336 | raise RuntimeError(f"""Feature types do not match. '{key}' in other is """ 337 | f"""{type(item)}. 
However, {type(self.feats[key])} is expected.""") 338 | 339 | if item.dtype != self.feats[key].dtype: 340 | raise RuntimeError(f"""Data types types do not match. '{key}' in other is """ 341 | f"""{item.dtype}. However, {self.feats[key].dtype} is expected.""") 342 | 343 | def to_json(self) -> dict: 344 | """ Converts the schema to a JSON representation 345 | 346 | Returns: 347 | dictionary with the features and their stats 348 | """ 349 | retdict = dict(feats=dict()) 350 | for key, val in self.feats.items(): 351 | retdict['feats'][key] = self.feats[key].to_json() 352 | 353 | return retdict 354 | 355 | def save(self, path): 356 | """ Saves a JSON with the schema representation 357 | 358 | Args: 359 | path (str): where the JSON will be saved. 360 | """ 361 | with open(path, 'w') as file: 362 | json.dump(self.to_json(), file) 363 | 364 | @classmethod 365 | def load(cls, path: str) -> "DataSchema": 366 | """ Loads a previously serialized schema (as JSON) 367 | 368 | Args: 369 | path: path to the serialized schema 370 | 371 | Returns: 372 | The rebuilt schema 373 | """ 374 | with open(path, 'r') as file: 375 | json_obj = json.load(file) 376 | schema = cls.from_json(json_obj) 377 | return schema 378 | 379 | @classmethod 380 | def from_json(cls, json_obj: dict) -> "DataSchema": 381 | """ Rebuilds an schema from a JSON representation. 
382 | 383 | Returns: 384 | The rebuild schema 385 | """ 386 | schema = DataSchema() 387 | factory = FeatureFactory() 388 | 389 | for featname, feat in json_obj['feats'].items(): 390 | ftype = FeatType[feat['feat_type']] 391 | dtype = DataType[feat['dtype']] 392 | feat_name = feat['name'] 393 | dummy_feat = factory._build_dummy_feature(dtype, ftype, feat_name) 394 | dummy_feat.stats = feat['stats'] 395 | schema.feats[featname] = dummy_feat 396 | 397 | return schema 398 | 399 | def get_features_by_type(self, datatype: DataType): 400 | return [key for key, feat in self.feats.items() if feat.dtype == datatype] 401 | --------------------------------------------------------------------------------