├── hypex ├── forks │ ├── __init__.py │ └── aa.py ├── ui │ ├── __init__.py │ ├── homo.py │ ├── aa.py │ ├── ab.py │ ├── matching.py │ └── base.py ├── analyzers │ ├── __init__.py │ ├── matching.py │ └── ab.py ├── encoders │ ├── __init__.py │ ├── encoders.py │ └── abstract.py ├── factory │ ├── __init__.py │ └── base.py ├── __version__.py ├── operators │ ├── __init__.py │ └── abstract.py ├── hypotheses │ ├── __init__.py │ └── hypothesis.py ├── ml │ ├── __init__.py │ └── faiss.py ├── dataset │ ├── backends │ │ └── __init__.py │ ├── __init__.py │ ├── roles.py │ └── abstract.py ├── splitters │ ├── __init__.py │ └── aa.py ├── executor │ └── __init__.py ├── utils │ ├── constants.py │ ├── adapter.py │ ├── enums.py │ ├── typings.py │ ├── __init__.py │ ├── errors.py │ └── decorator.py ├── experiments │ ├── __init__.py │ ├── base.py │ └── base_complex.py ├── __init__.py ├── reporters │ ├── __init__.py │ ├── homo.py │ ├── ab.py │ ├── matching.py │ ├── abstract.py │ └── aa.py ├── transformers │ ├── __init__.py │ ├── abstract.py │ ├── shuffle.py │ ├── category_agg.py │ └── na_filler.py ├── comparators │ ├── __init__.py │ ├── hypothesis_testing.py │ ├── comparators.py │ ├── power_testing.py │ └── distances.py ├── extensions │ ├── __init__.py │ ├── scipy_linalg.py │ ├── encoders.py │ ├── abstract.py │ ├── faiss.py │ ├── scipy_stats.py │ └── statsmodels.py ├── preprocessing.py ├── homogeneity.py └── ab.py ├── .flake8 ├── docs ├── requirements.txt ├── _templates │ ├── functiontemplate.rst │ ├── classtemplate.rst │ └── autosummary │ │ ├── class.rst │ │ └── module.rst ├── _static │ ├── style.css │ └── custom.css ├── api_reference.rst ├── index.rst ├── Makefile ├── installation.rst ├── mock_docs.py ├── quickstart.rst └── conf.py ├── coverage.sh ├── examples └── experiments │ └── performance_test │ ├── config.json │ └── config.schema.json ├── tests ├── test_example.py └── test_tutorials.py ├── .readthedocs.yaml ├── .github ├── PULL_REQUEST_TEMPLATE │ ├── docs.md │ ├── feature_request.md │ └── bug.md ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature_request.md │ └── question.md └── workflows │ └── ci.yml ├── tox.ini ├── pyproject.toml ├── .gitignore └── schemes └── architecture_levels.md /hypex/forks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/ui/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/analyzers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/factory/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.2" 2 | -------------------------------------------------------------------------------- /hypex/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from .operators import SMD 2 | 3 | __all__ = ["SMD"] 4 | 
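Note: the `SMD` re-exported above stands for standardized mean difference and lives in `operators/operators.py`, which the tree omits. For orientation only, here is a hedged sketch of the textbook formula with an equal-weight pooled standard deviation; it is not the library's actual implementation, which operates on `Dataset` objects:

import numpy as np

def smd_sketch(treated: np.ndarray, control: np.ndarray) -> float:
    # Textbook standardized mean difference: difference of sample means
    # scaled by the pooled (equal-weight) sample standard deviation.
    pooled_std = np.sqrt((treated.std(ddof=1) ** 2 + control.std(ddof=1) ** 2) / 2)
    return float((treated.mean() - control.mean()) / pooled_std)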
-------------------------------------------------------------------------------- /hypex/hypotheses/__init__.py: -------------------------------------------------------------------------------- 1 | # from .hypothesis import Hypothesis 2 | 3 | # __all__ = ["Hypothesis"] 4 | -------------------------------------------------------------------------------- /hypex/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from .faiss import FaissNearestNeighbors 2 | 3 | __all__ = ["FaissNearestNeighbors"] 4 | -------------------------------------------------------------------------------- /hypex/dataset/backends/__init__.py: -------------------------------------------------------------------------------- 1 | from .pandas_backend import PandasDataset 2 | 3 | __all__ = ["PandasDataset"] 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203, E266, E501, W503 4 | exclude = .venv, .git, __pycache__, build, dist 5 | -------------------------------------------------------------------------------- /hypex/splitters/__init__.py: -------------------------------------------------------------------------------- 1 | from .aa import AASplitter, AASplitterWithStratification 2 | 3 | __all__ = ["AASplitter", "AASplitterWithStratification"] 4 | -------------------------------------------------------------------------------- /hypex/executor/__init__.py: -------------------------------------------------------------------------------- 1 | from .executor import Calculator, Executor, IfExecutor, MLExecutor 2 | 3 | __all__ = ["Calculator", "Executor", "IfExecutor", "MLExecutor"] 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel 2 | nbsphinx 3 | nbsphinx-link 4 | sphinx-autodoc-typehints 5 | pandoc 6 | jupyter 7 | prompt-toolkit<3.0.0,!=3.0.1,>=2.0.0 8 | sphinx_rtd_theme 9 | -------------------------------------------------------------------------------- /hypex/utils/constants.py: -------------------------------------------------------------------------------- 1 | ID_SPLIT_SYMBOL = "\u2534" 2 | NAME_BORDER_SYMBOL = "\u2506" 3 | MATCHING_INDEXES_SPLITTER_SYMBOL = "\u256f" 4 | 5 | NUMBER_TYPES_LIST = [int, float] 6 | -------------------------------------------------------------------------------- /hypex/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Experiment, OnRoleExperiment 2 | from .base_complex import CycledExperiment, GroupExperiment 3 | 4 | __all__ = ["CycledExperiment", "Experiment", "GroupExperiment", "OnRoleExperiment"] 5 | -------------------------------------------------------------------------------- /hypex/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version__ import __version__ 2 | from .aa import AATest 3 | from .ab import ABTest 4 | from .homogeneity import HomogeneityTest 5 | from .matching import Matching 6 | 7 | __all__ = ["AATest", "ABTest", "HomogeneityTest", "Matching", "__version__"] 8 | -------------------------------------------------------------------------------- /coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export 
PYTHONPATH=$PYTHONPATH:. 4 | export PYTHONWARNINGS="ignore" 5 | 6 | coverage run --include="hypex/dataset/*" unitests/unitests.py 7 | 8 | # coverage report -m 9 | 10 | coverage html -d unitests/coverage_report 11 | 12 | rm -f .coverage -------------------------------------------------------------------------------- /hypex/reporters/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import DatasetReporter, DictReporter, Reporter 2 | from .homo import HomoDatasetReporter, HomoDictReporter 3 | 4 | __all__ = [ 5 | "DatasetReporter", 6 | "DictReporter", 7 | "HomoDatasetReporter", 8 | "HomoDictReporter", 9 | "Reporter", 10 | ] 11 | -------------------------------------------------------------------------------- /docs/_templates/functiontemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | {{ name | underline }} 6 | 7 | .. autofunction:: {{ fullname }} 8 | 9 | .. 10 | autogenerated from source/_templates/functiontemplate.rst 11 | note it does not have :inherited-members: 12 | -------------------------------------------------------------------------------- /docs/_templates/classtemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline }} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/classtemplate.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 
13 | autogenerated from source/_templates/autosummary/class.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /hypex/ui/homo.py: -------------------------------------------------------------------------------- 1 | from ..dataset import Dataset, ExperimentData 2 | from ..reporters.homo import HomoDatasetReporter 3 | from .base import Output 4 | 5 | 6 | class HomoOutput(Output): 7 | resume: Dataset 8 | 9 | def __init__(self): 10 | super().__init__(resume_reporter=HomoDatasetReporter()) 11 | 12 | def extract(self, experiment_data: ExperimentData): 13 | super().extract(experiment_data) 14 | -------------------------------------------------------------------------------- /hypex/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from ..encoders.encoders import DummyEncoder 2 | from .category_agg import CategoryAggregator 3 | from .filters import ConstFilter, CorrFilter, CVFilter, NanFilter, OutliersFilter 4 | from .na_filler import NaFiller 5 | from .shuffle import Shuffle 6 | 7 | __all__ = [ 8 | "CVFilter", 9 | "CategoryAggregator", 10 | "ConstFilter", 11 | "CorrFilter", 12 | "DummyEncoder", 13 | "NaFiller", 14 | "NanFilter", 15 | "OutliersFilter", 16 | "Shuffle", 17 | ] 18 | -------------------------------------------------------------------------------- /docs/_static/style.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: none; 3 | } 4 | 5 | .rst-content code.xref { 6 | /* !important prevents the common CSS stylesheets from overriding 7 | this as on RTD they are loaded after this stylesheet */ 8 | color: #E74C3C 9 | } 10 | 11 | html.writer-html4 .rst-content dl:not(.docutils) dl:not(.field-list)>dt, html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt { 12 | border-left-color: rgb(9, 183, 14) 13 | } 14 | -------------------------------------------------------------------------------- /hypex/encoders/encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..dataset import Dataset 4 | from ..extensions.encoders import DummyEncoderExtension 5 | from .abstract import Encoder 6 | 7 | 8 | class DummyEncoder(Encoder): 9 | @staticmethod 10 | def _inner_function( 11 | data: Dataset, target_cols: str | None = None, **kwargs 12 | ) -> Dataset: 13 | if not target_cols: 14 | return data 15 | return DummyEncoderExtension().calc( 16 | data=data, target_cols=target_cols, **kwargs 17 | ) 18 | -------------------------------------------------------------------------------- /hypex/comparators/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import Comparator, StatHypothesisTesting 2 | from .comparators import PSI, GroupDifference, GroupSizes 3 | from .distances import MahalanobisDistance 4 | from .hypothesis_testing import Chi2Test, KSTest, TTest, UTest 5 | from .power_testing import MDEBySize, PowerTesting 6 | 7 | __all__ = [ 8 | "PSI", 9 | "Chi2Test", 10 | "Comparator", 11 | "GroupDifference", 12 | "GroupSizes", 13 | "KSTest", 14 | "MDEBySize", 15 | "MahalanobisDistance", 16 | "PowerTesting", 17 | "StatHypothesisTesting", 18 | "TTest", 19 | "UTest", 20 | ] 21 | 
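Note: these comparators are building blocks for experiment pipelines; `hypex/homogeneity.py` below shows the canonical composition. A minimal construction sketch based on that file (the keyword arguments mirror the executors used in HOMOGENEITY_TEST; execution details are omitted):

from hypex.comparators import GroupDifference, GroupSizes, TTest
from hypex.dataset import TreatmentRole
from hypex.experiments.base import Experiment

# Each executor splits rows by the treatment column and compares the
# resulting groups, as in the HOMOGENEITY_TEST pipeline defined below.
experiment = Experiment(
    executors=[
        GroupSizes(grouping_role=TreatmentRole(), compare_by="groups"),
        GroupDifference(grouping_role=TreatmentRole(), compare_by="groups"),
        TTest(grouping_role=TreatmentRole(), compare_by="groups"),
    ]
)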
-------------------------------------------------------------------------------- /examples/experiments/performance_test/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "onefactor_params" : { 3 | "n_rows": [10, 100], 4 | "n_columns": [10, 15], 5 | "n_iterations": [10, 20] 6 | }, 7 | "montecarlo_params": { 8 | "num_points" : 10, 9 | "bounds": { 10 | "n_rows" : { 11 | "max" : 10000, 12 | "min" : 100 13 | }, 14 | "n_iterations" : { 15 | "max" : 100, 16 | "min" : 5 17 | }, 18 | "n_columns" : { 19 | "max" : 30, 20 | "min" : 10 21 | } 22 | } 23 | }, 24 | "fixed_params" : { 25 | "n_columns": 10, 26 | "n_rows": 1000, 27 | "n_iterations": 5 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /hypex/reporters/homo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from .aa import OneAADictReporter 7 | from .abstract import DatasetReporter 8 | 9 | 10 | class HomoDictReporter(OneAADictReporter): 11 | def report(self, data: ExperimentData) -> dict[str, Any]: 12 | return self.extract_data_from_analysis_tables(data) 13 | 14 | 15 | class HomoDatasetReporter(DatasetReporter): 16 | def __init__(self): 17 | super().__init__(dict_reporter=HomoDictReporter(front=False)) 18 | 19 | @staticmethod 20 | def convert_to_dataset(data: dict) -> dict[str, Dataset] | Dataset: 21 | return HomoDictReporter.convert_flat_dataset(data) 22 | -------------------------------------------------------------------------------- /hypex/utils/adapter.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | 6 | class Adapter: 7 | @staticmethod 8 | def to_list(data: Any) -> list: 9 | if data is None: 10 | return [] 11 | if isinstance(data, str): 12 | return [data] 13 | return list(data) if isinstance(data, Sequence) else [data] 14 | 15 | @staticmethod 16 | def list_to_single(data: list) -> Any: 17 | if isinstance(data, list): 18 | if len(data) == 0: 19 | return None 20 | elif len(data) == 1: 21 | return data[0] 22 | else: 23 | raise ValueError("Only a list of a single item can be accepted") 24 | -------------------------------------------------------------------------------- /hypex/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoders import DummyEncoderExtension 2 | from .faiss import FaissExtension 3 | from .scipy_linalg import CholeskyExtension, InverseExtension 4 | from .scipy_stats import ( 5 | Chi2TestExtension, 6 | KSTestExtension, 7 | TTestExtension, 8 | UTestExtension, 9 | ) 10 | from .statsmodels import MultiTest, MultitestQuantile 11 | 12 | __all__ = [ 13 | "Chi2TestExtension", 14 | "CholeskyExtension", 15 | "DummyEncoderExtension", 16 | "FaissExtension", 17 | "InverseExtension", 18 | "KSTestExtension", 19 | "MultiTest", 20 | "MultitestQuantile", 21 | "TTestExtension", 22 | "UTestExtension", 23 | ] 24 | -------------------------------------------------------------------------------- /hypex/transformers/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from ..dataset import Dataset, ExperimentData 4 | from ..executor import Calculator 5 | from ..utils import AbstractMethodError 6 | 7 | 8 | class 
Transformer(Calculator): 9 | @property 10 | def _is_transformer(self): 11 | return True 12 | 13 | @staticmethod 14 | @abstractmethod 15 | def _inner_function(data: Dataset, **kwargs) -> Dataset: 16 | raise AbstractMethodError 17 | 18 | @classmethod 19 | def calc(cls, data: Dataset, **kwargs): 20 | return cls._inner_function(data, **kwargs) 21 | 22 | def execute(self, data: ExperimentData) -> ExperimentData: 23 | data = data.copy(data=self.calc(data=data.ds)) 24 | return data 25 | -------------------------------------------------------------------------------- /docs/api_reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. currentmodule:: hypex 5 | 6 | Main Classes 7 | ------------ 8 | 9 | .. autosummary:: 10 | :toctree: _autosummary 11 | :nosignatures: 12 | :template: autosummary/class.rst 13 | 14 | AATest 15 | ABTest 16 | HomogeneityTest 17 | matching.Matching 18 | 19 | Dataset Module 20 | -------------- 21 | 22 | .. autosummary:: 23 | :toctree: _autosummary 24 | :nosignatures: 25 | :template: autosummary/class.rst 26 | 27 | dataset.Dataset 28 | dataset.ExperimentData 29 | 30 | Roles 31 | ~~~~~ 32 | 33 | .. autosummary:: 34 | :toctree: _autosummary 35 | :nosignatures: 36 | 37 | dataset.TargetRole 38 | dataset.TreatmentRole 39 | dataset.FeatureRole 40 | dataset.InfoRole 41 | -------------------------------------------------------------------------------- /tests/test_example.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import pytest 4 | 5 | 6 | @pytest.mark.parametrize("test_input, expected", [(1, 1), (2, 2), (3, 3)]) 7 | def test_example(test_input: Any, expected: Any) -> None: 8 | """ 9 | Tests if the input values are equal to the expected values. 10 | 11 | This test uses parametrization to check multiple pairs of values. 12 | It ensures that each input argument is equal to its expected value. 13 | 14 | Args: 15 | test_input: The input value for the test. 16 | expected: The expected value to compare against the input. 17 | 18 | Returns: 19 | None. The test simply asserts the condition. 20 | """ 21 | assert test_input == expected, f"Expected {expected}, got {test_input}" 22 | return 23 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. HypEx documentation master file 2 | 3 | Welcome to HypEx's documentation! 4 | ================================== 5 | 6 | HypEx is a fast and customizable framework for Causal Inference. 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: Contents: 11 | 12 | installation 13 | quickstart 14 | api_reference 15 | 16 | Installation 17 | ------------ 18 | 19 | .. code-block:: bash 20 | 21 | pip install hypex 22 | 23 | Quick Start 24 | ----------- 25 | 26 | .. 
code-block:: python 27 | 28 | from hypex import ABTest, AATest, Matching 29 | from hypex.dataset import Dataset, TargetRole, TreatmentRole 30 | 31 | # Your code here 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | formats: all 20 | 21 | # Optional but recommended, declare the Python requirements required 22 | # to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt 27 | - path: . 28 | -------------------------------------------------------------------------------- /hypex/preprocessing.py: -------------------------------------------------------------------------------- 1 | from .encoders.encoders import DummyEncoder 2 | from .experiments.base import Experiment 3 | from .transformers.category_agg import CategoryAggregator 4 | from .transformers.filters import ( 5 | ConstFilter, 6 | CorrFilter, 7 | CVFilter, 8 | NanFilter, 9 | OutliersFilter, 10 | ) 11 | from .transformers.na_filler import NaFiller 12 | 13 | PREPROCESSING_DATA = Experiment( 14 | executors=[ 15 | NaFiller(method="ffill"), 16 | CategoryAggregator(), 17 | CorrFilter(), 18 | CVFilter(), 19 | NanFilter(), 20 | ConstFilter(), 21 | OutliersFilter(lower_percentile=0.05, upper_percentile=0.95), 22 | DummyEncoder(), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /hypex/transformers/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from ..executor.executor import Calculator 7 | 8 | 9 | class Shuffle(Calculator): 10 | def __init__( 11 | self, 12 | random_state: int | None = None, 13 | key: Any = "", 14 | ): 15 | super().__init__(key) 16 | self.random_state = random_state 17 | 18 | @staticmethod 19 | def _inner_function(data: Dataset, random_state: int | None = None) -> Dataset: 20 | return data.shuffle(random_state=random_state) 21 | 22 | def generate_params_hash(self): 23 | return f"{self.random_state}" 24 | 25 | def execute(self, data: ExperimentData) -> ExperimentData: 26 | result = data.copy(data=self.calc(data=data.ds, random_state=self.random_state)) 27 | return result 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | clean: 23 | sphinx-build -M clean "$(SOURCEDIR)" "$(BUILDDIR)" 24 | sphinx-build -M clean "$(SOURCEDIR)" "imgs" 25 | sphinx-build -M clean "$(SOURCEDIR)" "pages/modules/generated/" 26 | rm -rf "$(SOURCEDIR)/_autosummary" 27 | -------------------------------------------------------------------------------- /hypex/extensions/scipy_linalg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd # type: ignore 3 | 4 | from ..dataset import Dataset 5 | from ..dataset.roles import FeatureRole 6 | from .abstract import Extension 7 | 8 | 9 | class CholeskyExtension(Extension): 10 | def _calc_pandas(self, data: Dataset, epsilon: float = 1e-3, **kwargs): 11 | cov = data.data.to_numpy() 12 | cov = cov + np.eye(cov.shape[0]) * epsilon 13 | return self.result_to_dataset( 14 | pd.DataFrame(np.linalg.cholesky(cov), columns=data.columns), 15 | {column: FeatureRole() for column in data.columns}, 16 | ) 17 | 18 | 19 | class InverseExtension(Extension): 20 | def _calc_pandas(self, data: Dataset, **kwargs): 21 | return self.result_to_dataset( 22 | pd.DataFrame(np.linalg.inv(data.data.to_numpy()), columns=data.columns), 23 | {column: FeatureRole() for column in data.columns}, 24 | ) 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Requirements 5 | ------------ 6 | 7 | * Python 3.8 or higher 8 | * NumPy 9 | * Pandas 10 | * SciPy 11 | * Scikit-learn 12 | * Statsmodels 13 | 14 | Basic Installation 15 | ------------------ 16 | 17 | Install HypEx using pip: 18 | 19 | .. code-block:: bash 20 | 21 | pip install hypex 22 | 23 | Development Installation 24 | ------------------------ 25 | 26 | For development, clone the repository and install in editable mode: 27 | 28 | .. code-block:: bash 29 | 30 | git clone https://github.com/sb-ai-lab/HypEx.git 31 | cd HypEx 32 | pip install -e . 33 | 34 | Optional Dependencies 35 | --------------------- 36 | 37 | For additional functionality, install with extras: 38 | 39 | .. 
code-block:: bash 40 | 41 | # For CatBoost support 42 | pip install hypex[cat] 43 | 44 | # For LightGBM support 45 | pip install hypex[lgbm] 46 | 47 | # All extras 48 | pip install hypex[all] 49 | -------------------------------------------------------------------------------- /hypex/extensions/encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copy 4 | 5 | import pandas as pd # type: ignore 6 | 7 | from ..dataset import Dataset, DatasetAdapter 8 | from .abstract import Extension 9 | 10 | 11 | class DummyEncoderExtension( 12 | Extension 13 | ): # TODO: role types are being rewritten, needs to be fixed 14 | @staticmethod 15 | def _calc_pandas(data: Dataset, target_cols: str | None = None, **kwargs): 16 | dummies_df = pd.get_dummies(data=data[target_cols].data, drop_first=True) 17 | # Assign a role to each dummy column by looking up its source column, 18 | # recovered from the part of the dummy name before the last "_" 19 | roles = {col: data.roles[col[: col.rfind("_")]] for col in dummies_df.columns} 20 | new_roles = copy.deepcopy(roles) # copy so the roles in data.roles stay intact 21 | for role in new_roles.values(): 22 | role.data_type = bool 23 | return DatasetAdapter.to_dataset(dummies_df, roles=new_roles) 24 | -------------------------------------------------------------------------------- /hypex/utils/enums.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | @enum.unique 5 | class ExperimentDataEnum(enum.Enum): 6 | variables = "variables" 7 | additional_fields = "additional_fields" 8 | analysis_tables = "analysis_tables" 9 | groups = "groups" 10 | 11 | 12 | @enum.unique 13 | class BackendsEnum(enum.Enum): 14 | pandas = "pandas" 15 | 16 | 17 | @enum.unique 18 | class SpaceEnum(enum.Enum): 19 | auto = "auto" 20 | additional = "additional" 21 | data = "data" 22 | 23 | 24 | @enum.unique 25 | class ABNTestMethodsEnum(enum.Enum): 26 | bonferroni = "bonferroni" 27 | sidak = "sidak" 28 | holm_sidak = "holm-sidak" 29 | holm = "holm" 30 | simes_hochberg = "simes-hochberg" 31 | hommel = "hommel" 32 | fdr_bh = "fdr_bh" 33 | fdr_by = "fdr_by" 34 | fdr_tsbh = "fdr_tsbh" 35 | fdr_tsbky = "fdr_tsbky" 36 | quantile = "quantile" 37 | 38 | 39 | @enum.unique 40 | class RenameEnum(enum.Enum): 41 | all = "all" 42 | columns = "columns" 43 | index = "index" 44 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | {{ name | underline }} 5 | 6 | .. automodule:: {{ fullname }} 7 | 8 | {% block classes %} 9 | {% if classes %} 10 | 11 | .. rubric:: {{ _('Classes') }} 12 | 13 | .. autosummary:: 14 | :toctree: generated 15 | :nosignatures: 16 | :template: classtemplate.rst 17 | 18 | {% for item in classes %} 19 | {{ item }} 20 | {% endfor %} 21 | {% endif %} 22 | {% endblock %} 23 | 24 | {% block functions %} 25 | {% if functions %} 26 | 27 | .. rubric:: {{ _('Functions') }} 28 | 29 | .. autosummary:: 30 | :toctree: generated 31 | :nosignatures: 32 | :template: functiontemplate.rst 33 | 34 | {% for item in functions %} 35 | {{ item }} 36 | {% endfor %} 37 | {% endif %} 38 | {% endblock %} 39 | 40 | {% block modules %} 41 | {% if modules %} 42 | 43 | .. rubric:: {{ _('Modules') }} 44 | 45 | .. 
autosummary:: 46 | :toctree: 47 | :recursive: 48 | 49 | {% for item in modules %} 50 | {{ item }} 51 | {% endfor %} 52 | {% endif %} 53 | {% endblock %} 54 | -------------------------------------------------------------------------------- /hypex/utils/typings.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import ( 3 | TYPE_CHECKING, 4 | Any, 5 | Callable, 6 | Dict, 7 | List, 8 | Sequence, 9 | Tuple, 10 | TypeVar, 11 | Union, 12 | ) 13 | 14 | if TYPE_CHECKING: 15 | from hypex.dataset import Dataset 16 | 17 | StratificationRoleTypes = Union[float, str, datetime.datetime] 18 | DefaultRoleTypes = Union[float, bool, str, int] 19 | TargetRoleTypes = Union[float, int, bool] 20 | CategoricalTypes = Union[str] 21 | ScalarType = Union[float, int, str, bool] 22 | GroupingDataType = Tuple[List[Tuple[str, "Dataset"]], List[Tuple[str, "Dataset"]]] 23 | 24 | 25 | MultiFieldKeyTypes = Union[str, Sequence[str]] 26 | 27 | FromDictTypes = Union[ 28 | Dict[str, List[Any]], 29 | List[Dict[Any, Any]], 30 | Dict[str, Dict[Any, List]], 31 | Dict[str, "Dataset"], 32 | ] 33 | RoleNameType = str 34 | DecoratedType = TypeVar("DecoratedType", bound=Union[Callable[..., Any], property]) 35 | DocstringInheritDecorator = Callable[[DecoratedType], DecoratedType] 36 | 37 | SetParamsDictTypes = Union[Dict[str, Any], Dict[type, Dict[str, Any]]] 38 | -------------------------------------------------------------------------------- /hypex/analyzers/matching.py: -------------------------------------------------------------------------------- 1 | from ..dataset.dataset import DatasetAdapter, ExperimentData 2 | from ..dataset.roles import StatisticRole 3 | from ..executor.executor import Executor 4 | from ..operators.operators import MatchingMetrics 5 | from ..utils.enums import ExperimentDataEnum 6 | 7 | 8 | class MatchingAnalyzer(Executor): 9 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 10 | return data.set_value( 11 | ExperimentDataEnum.analysis_tables, self.id, value, key=key 12 | ) 13 | 14 | def execute(self, data: ExperimentData): 15 | variables = data.variables[ 16 | data.get_one_id(MatchingMetrics, space=ExperimentDataEnum.variables) 17 | ] 18 | columns = ["Effect Size", "Standard Error", "P-value", "CI Lower", "CI Upper"] 19 | return self._set_value( 20 | data, 21 | DatasetAdapter.to_dataset( 22 | variables, 23 | {field: StatisticRole() for field in list(variables.keys())}, 24 | ).transpose(roles={column: StatisticRole() for column in columns}), 25 | ) 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/docs.md: -------------------------------------------------------------------------------- 1 | ### Documentation Update Description 2 | 3 | 4 | 5 | ### Areas of Documentation Updated 6 | 7 | 8 | 9 | 1. 10 | 2. 11 | 3. 12 | 13 | ### Details of Changes 14 | 15 | 16 | 17 | ### Screenshots / Code Snippets 18 | 19 | 20 | 21 | ### Related Issues or Pull Requests 22 | 23 | 24 | 25 | ### Additional Notes 26 | 27 | 28 | 29 | ### Checklist 30 | 31 | - [ ] The changes are clear and easy to understand. 32 | - [ ] I have verified that the changes are accurate and necessary. 33 | - [ ] The updated documentation has been tested for clarity and comprehensibility. 34 | - [ ] All modified sections are properly formatted and adhere to project documentation standards. 
35 | -------------------------------------------------------------------------------- /hypex/forks/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..analyzers.aa import OneAAStatAnalyzer 4 | from ..executor.executor import Executor, IfExecutor 5 | from ..utils.enums import ExperimentDataEnum 6 | 7 | 8 | class IfAAExecutor(IfExecutor): 9 | def __init__( 10 | self, 11 | if_executor: Executor | None = None, 12 | else_executor: Executor | None = None, 13 | sample_size: float | None = None, 14 | key: str = "", 15 | ): 16 | self.sample_size = sample_size 17 | super().__init__(if_executor, else_executor, key) 18 | 19 | def check_rule(self, data, **kwargs) -> bool: 20 | if self.sample_size is not None: 21 | score_table_id = data.get_one_id( 22 | OneAAStatAnalyzer, 23 | ExperimentDataEnum.analysis_tables, 24 | ) 25 | score_table = data.analysis_tables[score_table_id] 26 | feature_pass = sum( 27 | [ 28 | score_table.loc[:, column].get_values()[0][0] 29 | for column in score_table.columns 30 | if "pass" in column 31 | ] 32 | ) 33 | return True if feature_pass >= 1 else False 34 | return False 35 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | ### Changes Made 6 | 7 | 8 | 9 | ### Related Issues 10 | 11 | 12 | 13 | ### Additional Notes 14 | 15 | 16 | 17 | ### Testing and Validation 18 | 19 | 20 | 21 | ### Performance Considerations 22 | 23 | 24 | 25 | ### Breaking Changes 26 | 27 | 28 | 29 | ### Dependencies 30 | 31 | 32 | 33 | ### Merge Request Checklist 34 | 35 | - [ ] Code follows project coding guidelines. 36 | - [ ] Documentation reflects the changes made. 37 | - [ ] Unit tests cover new or changed functionality. 38 | - [ ] Performance and breaking changes have been considered. 39 | -------------------------------------------------------------------------------- /docs/mock_docs.py: -------------------------------------------------------------------------------- 1 | """A one line summary of the module or program, terminated by a period. 2 | 3 | Leave one blank line. The rest of this docstring should contain an 4 | overall description of the module or program. Optionally, it may also 5 | contain a brief description of exported classes and functions and/or usage 6 | examples. 7 | 8 | Typical usage example: 9 | 10 | >>> print('something') 11 | something 12 | >>> a = MyClass('be', 'or', 'not') 13 | 14 | """ 15 | 16 | import datetime 17 | 18 | 19 | class MyClass: 20 | """Description of class. 21 | 22 | Really do nothing. 23 | 24 | Attributes: 25 | attr1 (str): Description of `attr1`. 26 | attr2 (str): Description of `attr2`. 27 | 28 | Args: 29 | attr1: Description of `attr1`. 30 | attr2: Description of `attr2`. 31 | 32 | 33 | """ 34 | 35 | def __init__(self, attr1: str, attr2: str): 36 | self.attr1 = attr1 37 | self.attr2 = attr2 38 | date = datetime.datetime.now() 39 | print( 40 | f"{date.day}.{date.month}.{date.year} {date.hour}:{date.minute}:{date.second}" 41 | ) 42 | 43 | 44 | # .. 
toctree:: 45 | # :glob: 46 | # :maxdepth: 1 47 | # :caption: Tutorials 48 | # 49 | # tutorials/tutor_1.ipynb 50 | # tutorials/tutor_2.ipynb 51 | # tutorials/tutor_3.ipynb 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🐛 Bug Description 11 | 12 | 13 | 14 | ### Steps To Reproduce 15 | 16 | 17 | 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | ### Expected Behavior 24 | 25 | 26 | 27 | ### Screenshots 28 | 29 | 30 | 31 | ### Environment 32 | 33 | 34 | 35 | - HypEx Version: [e.g. 0.0.4] 36 | - Python Version: [e.g. 3.8] 37 | - Operating System: [e.g. iOS, Windows, Linux] 38 | 39 | ### Additional Context 40 | 41 | 42 | 43 | ### Possible Solution 44 | 45 | 46 | 47 | ### Checklist 48 | 49 | - [ ] I have described the bug in detail 50 | - [ ] I have provided steps to reproduce 51 | - [ ] I have provided the expected behavior 52 | - [ ] I have provided screenshots (if applicable) 53 | - [ ] I have provided my environment details 54 | - [ ] I have suggested a possible solution (if applicable) 55 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] ' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🚀 Feature Proposal 11 | 12 | 13 | 14 | ### Motivation 15 | 16 | 17 | 18 | ### Feature Description 19 | 20 | 21 | 22 | ### Potential Impacts 23 | 24 | 29 | 30 | ### Alternatives 31 | 32 | 33 | 34 | ### Additional Context 35 | 36 | 37 | 38 | ### Checklist 39 | 40 | - [ ] I have clearly described the feature. 41 | - [ ] I have outlined the motivation for the proposal. 42 | - [ ] I have provided a detailed description of the feature. 43 | - [ ] I have discussed potential impacts and alternatives. 44 | - [ ] I have added any additional context or screenshots. 45 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | ### Bug Description 2 | 3 | 4 | 5 | ### Steps to Reproduce 6 | 7 | 8 | 9 | 1. 10 | 2. 11 | 3. 12 | 4. 13 | 14 | ### Expected Behavior 15 | 16 | 17 | 18 | ### Actual Behavior 19 | 20 | 21 | 22 | ### Changes Made 23 | 24 | 25 | 26 | ### Testing Performed 27 | 28 | 29 | 30 | ### Related Issues 31 | 32 | 33 | 34 | ### Additional Notes 35 | 36 | 37 | 38 | ### Checklist 39 | 40 | - [ ] The code follows project coding guidelines. 41 | - [ ] I have added tests to cover my changes. 42 | - [ ] All new and existing tests passed. 43 | - [ ] Documentation has been updated to reflect the changes made. 44 | - [ ] I have verified that the changes fix the issue as described. 
45 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: How to Question 3 | about: Ask for guidance or clarification on how to use the project 4 | title: '[QUESTION] ' 5 | labels: help wanted, question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## ❓ How to Question 11 | 12 | ### Context 13 | 14 | 15 | 16 | ### Question 17 | 18 | 19 | 20 | ### What I've Tried 21 | 22 | 23 | 24 | ### Code (if applicable) 25 | 26 | 27 | 28 | ### Research 29 | 30 | 1. Have you searched for similar questions in existing issues? 31 | 2. Have you consulted the documentation to find an answer? 32 | 33 | ### Additional Context 34 | 35 | 36 | 37 | ### Checklist 38 | 39 | - [ ] I have provided context for my question. 40 | - [ ] I have stated my question clearly. 41 | - [ ] I have shared what I've tried and what I've learned. 42 | - [ ] I have checked existing issues and documentation. 43 | - [ ] I have provided code and additional context if applicable. 44 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick Start Guide 2 | ================= 3 | 4 | This guide will help you get started with HypEx. 5 | 6 | Basic Usage 7 | ----------- 8 | 9 | A/B Testing 10 | ~~~~~~~~~~~ 11 | 12 | .. code-block:: python 13 | 14 | from hypex import ABTest 15 | from hypex.dataset import Dataset, FeatureRole, TargetRole, TreatmentRole 16 | import pandas as pd 17 | 18 | # Load your data 19 | df = pd.read_csv('your_data.csv') 20 | 21 | # Create dataset with roles 22 | data = Dataset( 23 | roles={ 24 | 'conversion': TargetRole(), 25 | 'group': TreatmentRole(), 26 | 'feature1': FeatureRole(), 27 | 'feature2': FeatureRole() 28 | }, 29 | data=df 30 | ) 31 | 32 | # Run A/B test 33 | ab_test = ABTest() 34 | results = ab_test.execute(data) 35 | 36 | # View results 37 | print(results.resume) 38 | 39 | A/A Testing 40 | ~~~~~~~~~~~ 41 | 42 | .. code-block:: python 43 | 44 | from hypex import AATest 45 | 46 | # Run A/A test to check for sample ratio mismatch 47 | aa_test = AATest( 48 | n_iterations=100, 49 | stratification=True 50 | ) 51 | results = aa_test.execute(data) 52 | 53 | # Check if splits are good 54 | print(results.resume) 55 | 56 | Matching 57 | ~~~~~~~~ 58 | 59 | .. 
code-block:: python 60 | 61 | from hypex import Matching 62 | 63 | # Perform matching analysis 64 | matching = Matching( 65 | distance="mahalanobis", 66 | metric="att" 67 | ) 68 | results = matching.execute(data) 69 | 70 | # View matched pairs and treatment effects 71 | print(results.resume) 72 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths-ignore: 7 | - "docs/**" 8 | - "*.md" 9 | pull_request: 10 | branches: [ master ] 11 | paths-ignore: 12 | - "docs/**" 13 | - "*.md" 14 | 15 | jobs: 16 | test: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 21 | os: [ubuntu-latest, macos-latest, windows-latest] 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install dependencies for tox 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install tox 34 | pip install poetry 35 | pip install pytest 36 | poetry install --no-root --without dev 37 | 38 | - name: Run unit tests 39 | run: tox -e py 40 | 41 | linters: 42 | runs-on: ubuntu-latest # Linters and docs run on a single Python version only 43 | steps: 44 | - uses: actions/checkout@v4 45 | 46 | - name: Set up Python 3.10 47 | uses: actions/setup-python@v4 48 | with: 49 | python-version: "3.10" 50 | 51 | - name: Install dependencies for linters 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install tox 55 | pip install poetry 56 | poetry install --no-root 57 | 58 | - name: Run linters (mypy, codespell, docs) 59 | run: | 60 | tox -------------------------------------------------------------------------------- /hypex/reporters/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, ClassVar 4 | 5 | from ..analyzers.ab import ABAnalyzer 6 | from ..comparators import Chi2Test, TTest, UTest 7 | from ..dataset import Dataset, ExperimentData 8 | from ..utils import ExperimentDataEnum 9 | from .aa import OneAADictReporter 10 | 11 | 12 | class ABDictReporter(OneAADictReporter): 13 | tests: ClassVar[list] = [TTest, UTest, Chi2Test] 14 | 15 | def extract_analyzer_data(self, data: ExperimentData) -> dict[str, Any]: 16 | analyzer_id = data.get_one_id(ABAnalyzer, ExperimentDataEnum.analysis_tables) 17 | return self.extract_from_one_row_dataset(data.analysis_tables[analyzer_id]) 18 | 19 | def extract_data_from_analysis_tables(self, data: ExperimentData) -> dict[str, Any]: 20 | result = {} 21 | result.update(self.extract_group_sizes(data)) 22 | result.update(self.extract_group_difference(data)) 23 | result.update(self.extract_tests(data)) 24 | result.update(self.extract_analyzer_data(data)) 25 | return result 26 | 27 | def report(self, data: ExperimentData) -> dict[str, Any]: 28 | return self.extract_data_from_analysis_tables(data) 29 | 30 | 31 | class ABDatasetReporter(ABDictReporter): 32 | @staticmethod 33 | def _invert_aa_format(table: Dataset) -> Dataset: 34 | return table.replace("NOT OK", "N").replace("OK", "NOT OK").replace("N", "OK") 35 | 36 | def report(self, data: ExperimentData): 37 | front_buffer = self.front 38 | self.front = False 39 | dict_report = super().report(data) 40 | 
self.front = front_buffer 41 | result = self.convert_flat_dataset(dict_report) 42 | return self._invert_aa_format(result) 43 | -------------------------------------------------------------------------------- /hypex/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the dataset module in the HypEx library. 2 | This module defines data structures and roles used across the library for managing and manipulating experimental data. 3 | """ 4 | 5 | from .abstract import DatasetBase 6 | from .dataset import Dataset, DatasetAdapter, ExperimentData 7 | from .roles import ( 8 | ABCRole, 9 | AdditionalGroupingRole, 10 | AdditionalMatchingRole, 11 | AdditionalPreTargetRole, 12 | AdditionalTargetRole, 13 | AdditionalTreatmentRole, 14 | ConstGroupRole, 15 | DefaultRole, 16 | FeatureRole, 17 | FilterRole, 18 | GroupingRole, 19 | InfoRole, 20 | PreTargetRole, 21 | StatisticRole, 22 | StratificationRole, 23 | TargetRole, 24 | TempGroupingRole, 25 | TempRole, 26 | TempTargetRole, 27 | TempTreatmentRole, 28 | TreatmentRole, 29 | default_roles, 30 | ) 31 | 32 | __all__ = [ 33 | "ABCRole", 34 | "AdditionalGroupingRole", 35 | "AdditionalMatchingRole", 36 | "AdditionalPreTargetRole", 37 | "AdditionalTargetRole", 38 | "AdditionalTreatmentRole", 39 | "ConstGroupRole", 40 | "Dataset", 41 | "DatasetAdapter", 42 | "DatasetBase", 43 | "DefaultRole", 44 | "ExperimentData", 45 | "FeatureRole", 46 | "FilterRole", 47 | "GroupingRole", 48 | "InfoRole", 49 | "PreTargetRole", 50 | "StatisticRole", 51 | "StratificationRole", 52 | "TargetRole", 53 | "TempGroupingRole", 54 | "TempRole", 55 | "TempTargetRole", 56 | "TempTreatmentRole", 57 | "TreatmentRole", 58 | "default_roles", 59 | ] 60 | -------------------------------------------------------------------------------- /hypex/encoders/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | from ..dataset import Dataset, ExperimentData, FeatureRole 6 | from ..executor import Calculator 7 | from ..utils import ( 8 | NAME_BORDER_SYMBOL, 9 | AbstractMethodError, 10 | CategoricalTypes, 11 | ExperimentDataEnum, 12 | ) 13 | 14 | 15 | class Encoder(Calculator): 16 | def __init__( 17 | self, 18 | target_roles: str | Sequence[str] | None = None, 19 | key: Any = "", 20 | ): 21 | self.target_roles = target_roles or FeatureRole() 22 | self._key = key 23 | super().__init__(key) 24 | 25 | @property 26 | def __is_encoder(self): 27 | return True 28 | 29 | @property 30 | def search_types(self): 31 | return [CategoricalTypes] 32 | 33 | def _get_ids(self, col_name): 34 | self.key = f"{NAME_BORDER_SYMBOL}{col_name}{NAME_BORDER_SYMBOL}" 35 | return self.id 36 | 37 | def _ids_to_names(self, col_names: list[str]): 38 | return {col_name: self._get_ids(col_name) for col_name in col_names} 39 | 40 | @staticmethod 41 | def _inner_function(data: Dataset, **kwargs) -> Dataset: 42 | raise AbstractMethodError 43 | 44 | def _set_value( 45 | self, data: ExperimentData, value: Dataset, key=None 46 | ) -> ExperimentData: 47 | return data.set_value( 48 | space=ExperimentDataEnum.additional_fields, 49 | executor_id=self._ids_to_names(value.columns), 50 | value=value, 51 | role=value.roles, 52 | ) 53 | 54 | def execute(self, data: ExperimentData) -> ExperimentData: 55 | target_cols = data.ds.search_columns( 56 | roles=self.target_roles, search_types=self.search_types 57 | ) 58 | return self._set_value( 59 | data=data, 60 
| value=self.calc(data=data.ds, target_cols=target_cols), 61 | key=self.key, 62 | ) 63 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | min_version = 3.28.0 3 | isolated_build = True 4 | envlist = 5 | py{38,39,310,311,312}, 6 | lint, 7 | docs, 8 | typing, 9 | build, 10 | codespell 11 | 12 | [tox:.package] 13 | basepython = python3 14 | 15 | [gh-actions] 16 | python = 17 | 3.8: py38 18 | 3.9: py39 19 | 3.10: py310 20 | 3.11: py311 21 | 3.12: py312 22 | 23 | [testenv] 24 | allowlist_externals = make 25 | package = wheel 26 | deps = 27 | .[all] # Install all dependencies from pyproject.toml 28 | pytest >= 6.2.5 29 | psutil >= 7.0.0 30 | alive-progress >= 3.1.0 31 | jsonschema >= 4.23.0 32 | 33 | 34 | commands = 35 | pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG 36 | 37 | [testenv:lint] 38 | skip_install = true 39 | description = Lint code using Ruff 40 | deps = 41 | ruff >= 0.3.0 42 | commands = 43 | ruff check . --ignore RUF003,C901 --fix 44 | 45 | [testenv:docs] 46 | description = Build documentation using Sphinx 47 | changedir = docs 48 | deps = 49 | sphinx >= 5.3.0 50 | sphinx-autodoc-typehints >= 1.19.5 51 | sphinx-rtd-theme >= 1.1.1 52 | nbsphinx >= 0.8.10 53 | nbsphinx-link >= 1.3.0 54 | doc8 >= 0.10.1 55 | rstcheck >= 3.3.1 56 | pandoc >= 2.0.1 57 | IPython >= 7.0 58 | commands = 59 | make clean html 60 | doc8 . --ignore-path _autosummary --ignore-path _build --ignore-path _templates 61 | 62 | [testenv:typing] 63 | skip_install = true 64 | description = Run type checks with mypy 65 | deps = 66 | mypy >= 0.991 67 | pandas-stubs 68 | pytest 69 | types-pytz 70 | types-tqdm 71 | types-requests 72 | types-pyyaml 73 | commands = 74 | mypy {posargs:. 
tests} 75 | 76 | [testenv:build] 77 | description = Build the project using Poetry 78 | deps = 79 | poetry >= 1.1.7 80 | commands = 81 | poetry build 82 | 83 | [testenv:codespell] 84 | skip_install = true 85 | description = Check for spelling errors 86 | deps = 87 | codespell >= 2.3.0 88 | commands = 89 | codespell --skip="docs,_build,imgs,schemes,poetry.lock" --ignore-words-list="dotA,TE" -------------------------------------------------------------------------------- /hypex/extensions/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Literal 5 | 6 | from ..dataset import ABCRole, Dataset 7 | from ..dataset.backends import PandasDataset 8 | from ..dataset.dataset import DatasetAdapter 9 | from ..utils.errors import AbstractMethodError 10 | 11 | 12 | class Extension(ABC): 13 | def __init__(self): 14 | self.BACKEND_MAPPING = { 15 | PandasDataset: self._calc_pandas, 16 | } 17 | 18 | @abstractmethod 19 | def _calc_pandas(self, data: Dataset, **kwargs): 20 | raise AbstractMethodError 21 | 22 | def calc(self, data: Dataset, **kwargs): 23 | return self.BACKEND_MAPPING[type(data.backend)](data=data, **kwargs) 24 | 25 | @staticmethod 26 | def result_to_dataset(result: Any, roles: ABCRole | dict[str, ABCRole]) -> Dataset: 27 | return DatasetAdapter.to_dataset(result, roles=roles) 28 | 29 | 30 | class CompareExtension(Extension, ABC): 31 | def calc(self, data: Dataset, other: Dataset | None = None, **kwargs): 32 | return super().calc(data=data, other=other, **kwargs) 33 | 34 | 35 | class MLExtension(Extension): 36 | # TODO: add model 37 | def _calc_pandas( 38 | self, 39 | data: Dataset, 40 | test_data: Dataset | None = None, 41 | mode: Literal["auto", "fit", "predict"] | None = None, 42 | **kwargs, 43 | ): 44 | if mode in ["auto", "fit"]: 45 | return self.fit(data, test_data, **kwargs) 46 | return self.predict(data) 47 | 48 | @abstractmethod 49 | def fit(self, X, Y=None, **kwargs): 50 | raise NotImplementedError 51 | 52 | @abstractmethod 53 | def predict(self, X, **kwargs): 54 | raise NotImplementedError 55 | 56 | def calc( 57 | self, 58 | data: Dataset, 59 | target_data: Dataset | None = None, 60 | test_data: Dataset | None = None, 61 | **kwargs, 62 | ): 63 | return super().calc( 64 | data=data, target_data=target_data, test_data=test_data, **kwargs 65 | ) 66 | -------------------------------------------------------------------------------- /hypex/transformers/category_agg.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | from ..dataset.dataset import Dataset, ExperimentData 6 | from ..dataset.roles import FeatureRole 7 | from ..utils import CategoricalTypes 8 | from ..utils.adapter import Adapter 9 | from .abstract import Transformer 10 | 11 | 12 | class CategoryAggregator(Transformer): 13 | def __init__( 14 | self, 15 | target_roles: str | Sequence[str] | None = None, 16 | threshold: int | None = 15, 17 | new_group_name: str | None = None, 18 | key: Any = "", 19 | ): 20 | super().__init__(key=key) 21 | self.target_roles = target_roles or FeatureRole() 22 | self.threshold = threshold 23 | self.new_group_name = new_group_name 24 | 25 | @property 26 | def search_types(self): 27 | return [CategoricalTypes] 28 | 29 | @staticmethod 30 | def _inner_function( 31 | data: Dataset, 32 | target_cols: str | None = None, 33 | threshold: 
int | None = 15, 34 | new_group_name: str | None = None, 35 | ) -> Dataset: 36 | target_cols = Adapter.to_list(target_cols) 37 | for column in target_cols: 38 | categories_counts = data[column].value_counts() 39 | values_to_replace = categories_counts[ 40 | categories_counts["count"] < threshold 41 | ][column].get_values(column=column) 42 | data[column] = data[column].replace( 43 | to_replace=values_to_replace, value=new_group_name 44 | ) 45 | 46 | return data 47 | 48 | def execute(self, data: ExperimentData) -> ExperimentData: 49 | target_cols = data.ds.search_columns( 50 | roles=self.target_roles, search_types=self.search_types 51 | ) 52 | result = data.copy( 53 | data=self.calc( 54 | data=data.ds, 55 | target_cols=target_cols, 56 | threshold=self.threshold, 57 | new_group_name=self.new_group_name, 58 | ) 59 | ) 60 | return result 61 | -------------------------------------------------------------------------------- /hypex/ui/aa.py: -------------------------------------------------------------------------------- 1 | from ..analyzers.aa import AAScoreAnalyzer 2 | from ..dataset import Dataset, ExperimentData 3 | from ..reporters.aa import AABestSplitReporter, AAPassedReporter 4 | from ..utils import ExperimentDataEnum 5 | from ..utils.enums import RenameEnum 6 | from .base import Output 7 | 8 | 9 | class AAOutput(Output): 10 | best_split: Dataset 11 | experiments: Dataset 12 | aa_score: Dataset 13 | best_split_statistic: Dataset 14 | 15 | def __init__(self): 16 | super().__init__( 17 | resume_reporter=AAPassedReporter(), 18 | additional_reporters={"best_split": AABestSplitReporter()}, 19 | ) 20 | 21 | def _extract_experiments(self, experiment_data: ExperimentData): 22 | id_ = experiment_data.get_one_id( 23 | "ParamsExperiment", ExperimentDataEnum.analysis_tables, "AATest" 24 | ) 25 | self.experiments = self._replace_splitters( 26 | experiment_data.analysis_tables[id_], RenameEnum.columns 27 | ) 28 | 29 | def _extract_aa_score(self, experiment_data: ExperimentData): 30 | def get_analyzer_id(key: str): 31 | target_id = [i for i in aa_score_analyser_ids if i.endswith(key)] 32 | if len(target_id): 33 | return target_id[0] 34 | else: 35 | raise ValueError("Result of AAScoreAnalyzer was not found.") 36 | 37 | aa_score_analyser_ids = experiment_data.get_ids( 38 | AAScoreAnalyzer, ExperimentDataEnum.analysis_tables 39 | )[AAScoreAnalyzer.__name__][ExperimentDataEnum.analysis_tables.value] 40 | 41 | self.aa_score = experiment_data.analysis_tables[get_analyzer_id("aa score")] 42 | self.aa_score = self._replace_splitters(self.aa_score, RenameEnum.index) 43 | 44 | self.best_split_statistic = experiment_data.analysis_tables[ 45 | get_analyzer_id("best split statistics") 46 | ] 47 | 48 | def extract(self, experiment_data: ExperimentData): 49 | super().extract(experiment_data) 50 | self._extract_experiments(experiment_data) 51 | self._extract_aa_score(experiment_data) 52 | -------------------------------------------------------------------------------- /hypex/comparators/hypothesis_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..dataset import Dataset 4 | from ..extensions.scipy_stats import ( 5 | Chi2TestExtension, 6 | KSTestExtension, 7 | TTestExtension, 8 | UTestExtension, 9 | ) 10 | from ..utils.constants import NUMBER_TYPES_LIST 11 | from .abstract import StatHypothesisTesting 12 | 13 | 14 | class TTest(StatHypothesisTesting): 15 | @property 16 | def search_types(self) -> list[type] | None: 17 | 
return NUMBER_TYPES_LIST 18 | 19 | @classmethod 20 | def _inner_function( 21 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 22 | ) -> Dataset: 23 | return TTestExtension(kwargs.get("reliability", 0.05)).calc( 24 | data, other=test_data, **kwargs 25 | ) 26 | 27 | 28 | class KSTest(StatHypothesisTesting): 29 | @property 30 | def search_types(self) -> list[type] | None: 31 | return NUMBER_TYPES_LIST 32 | 33 | @classmethod 34 | def _inner_function( 35 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 36 | ) -> Dataset: 37 | return KSTestExtension(kwargs.get("reliability", 0.05)).calc( 38 | data, other=test_data, **kwargs 39 | ) 40 | 41 | 42 | class UTest(StatHypothesisTesting): 43 | @property 44 | def search_types(self) -> list[type] | None: 45 | return NUMBER_TYPES_LIST 46 | 47 | @classmethod 48 | def _inner_function( 49 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 50 | ) -> Dataset: 51 | return UTestExtension(kwargs.get("reliability", 0.05)).calc( 52 | data, other=test_data, **kwargs 53 | ) 54 | 55 | 56 | class Chi2Test(StatHypothesisTesting): 57 | @property 58 | def search_types(self) -> list[type] | None: 59 | return [str] 60 | 61 | @classmethod 62 | def _inner_function( 63 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 64 | ) -> Dataset: 65 | return Chi2TestExtension(reliability=kwargs.get("reliability", 0.05)).calc( 66 | data, other=test_data, **kwargs 67 | ) 68 | -------------------------------------------------------------------------------- /hypex/transformers/na_filler.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Literal, Sequence 4 | 5 | from ..dataset.dataset import Dataset, ExperimentData 6 | from ..dataset.roles import FeatureRole 7 | from ..utils import ScalarType 8 | from ..utils.adapter import Adapter 9 | from .abstract import Transformer 10 | 11 | 12 | class NaFiller(Transformer): 13 | def __init__( 14 | self, 15 | target_roles: str | Sequence[str] | None = None, 16 | values: ScalarType | dict[str, ScalarType] | None = None, 17 | method: Literal["bfill", "ffill"] | None = None, 18 | key: Any = "", 19 | ): 20 | """ 21 | Initializes a NaFiller object. 22 | 23 | Args: 24 | target_roles (Optional[Union[str, Sequence[str]]], optional): The roles of the target columns. Defaults to None. 25 | key (Any, optional): The key for the NaFiller object. Defaults to "". 26 | values (Union[ScalarType, Dict[str, ScalarType]], optional): The values to fill missing values with. Defaults to None. 27 | method (Literal["bfill", "ffill"], optional): The method to fill missing values. Defaults to None. 
28 | 29 | Returns: 30 | None 31 | """ 32 | 33 | super().__init__(key=key) 34 | self.target_roles = target_roles or FeatureRole() 35 | self.values = values 36 | self.method = method 37 | 38 | @staticmethod 39 | def _inner_function( 40 | data: Dataset, 41 | target_cols: str | None = None, 42 | values: ScalarType | dict[str, ScalarType] | None = None, 43 | method: Literal["bfill", "ffill"] | None = None, 44 | ) -> Dataset: 45 | target_cols = Adapter.to_list(target_cols) 46 | for column in target_cols: 47 | value = values[column] if isinstance(values, dict) else values 48 | data[column] = data[column].fillna(values=value, method=method) 49 | return data 50 | 51 | def execute(self, data: ExperimentData) -> ExperimentData: 52 | target_cols = data.ds.search_columns(roles=self.target_roles) 53 | result = data.copy( 54 | data=self.calc( 55 | data=data.ds, 56 | target_cols=target_cols, 57 | values=self.values, 58 | method=self.method, 59 | ) 60 | ) 61 | return result 62 | -------------------------------------------------------------------------------- /hypex/homogeneity.py: -------------------------------------------------------------------------------- 1 | from .analyzers.aa import OneAAStatAnalyzer 2 | from .comparators import Chi2Test, GroupDifference, GroupSizes, KSTest, TTest 3 | from .dataset import TargetRole, TreatmentRole 4 | from .experiments.base import Experiment, OnRoleExperiment 5 | from .ui.base import ExperimentShell 6 | from .ui.homo import HomoOutput 7 | 8 | HOMOGENEITY_TEST = Experiment( 9 | executors=[ 10 | OnRoleExperiment( 11 | executors=[ 12 | GroupSizes(grouping_role=TreatmentRole(), compare_by="groups"), 13 | GroupDifference(grouping_role=TreatmentRole(), compare_by="groups"), 14 | TTest(grouping_role=TreatmentRole(), compare_by="groups"), 15 | KSTest(grouping_role=TreatmentRole(), compare_by="groups"), 16 | Chi2Test(grouping_role=TreatmentRole(), compare_by="groups"), 17 | ], 18 | role=TargetRole(), 19 | ), 20 | OneAAStatAnalyzer(), 21 | ] 22 | ) 23 | 24 | 25 | class HomogeneityTest(ExperimentShell): 26 | """A class for conducting homogeneity tests between the groups. 27 | 28 | This class provides functionality to test whether treatment and control groups are 29 | homogeneous across target variables using multiple statistical tests including t-test, 30 | Kolmogorov-Smirnov test, and chi-square test. 31 | 32 | The class runs the following analyses: 33 | - Group size comparisons 34 | - Group differences 35 | - T-test for continuous variables 36 | - KS-test for distribution comparisons 37 | - Chi-square test for categorical variables 38 | - AA statistics analysis 39 | 40 | Examples 41 | -------- 42 | .. 
code-block:: python 43 | 44 | # Basic homogeneity test 45 | homo_test = HomogeneityTest() 46 | results = homo_test.execute(data) 47 | 48 | # Accessing specific test results 49 | homo_test = HomogeneityTest() 50 | results = homo_test.execute(data) 51 | output = results.resume 52 | 53 | # Running test on dataset with roles 54 | from hypex.dataset import Dataset, TargetRole, TreatmentRole 55 | ds = Dataset( 56 | roles={ 57 | 'treatment': TreatmentRole(), 58 | 'outcome': TargetRole() 59 | }, 60 | data=df 61 | ) 62 | homo_test = HomogeneityTest() 63 | results = homo_test.execute(ds) 64 | """ 65 | 66 | def __init__(self): 67 | """Initialize HomogeneityTest with default experiment and output configurations.""" 68 | super().__init__( 69 | experiment=HOMOGENEITY_TEST, 70 | output=HomoOutput(), 71 | ) 72 | -------------------------------------------------------------------------------- /hypex/ui/ab.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from ..analyzers.ab import ABAnalyzer 4 | from ..comparators import GroupDifference, GroupSizes 5 | from ..dataset import Dataset, ExperimentData, StatisticRole, TreatmentRole 6 | from ..reporters.ab import ABDatasetReporter 7 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum 8 | from .base import Output 9 | 10 | 11 | class ABOutput(Output): 12 | multitest: Union[Dataset, str] 13 | sizes: Dataset 14 | 15 | def __init__(self): 16 | self._groups = [] 17 | super().__init__(resume_reporter=ABDatasetReporter()) 18 | 19 | def _extract_multitest_result(self, experiment_data: ExperimentData): 20 | multitest_id = experiment_data.get_one_id( 21 | ABAnalyzer, ExperimentDataEnum.analysis_tables 22 | ) 23 | if multitest_id and "MultiTest" in multitest_id: 24 | self.multitest = experiment_data.analysis_tables[multitest_id] 25 | else: 26 | self.multitest = ( 27 | "There were fewer than three groups, or no multitest method was provided" 28 | ) 29 | 30 | def _extract_differences(self, experiment_data: ExperimentData): 31 | targets = [] 32 | groups = [] 33 | ids = experiment_data.get_ids( 34 | GroupDifference, 35 | searched_space=ExperimentDataEnum.analysis_tables, 36 | )["GroupDifference"]["analysis_tables"] 37 | self._groups = list( 38 | experiment_data.groups[ 39 | experiment_data.ds.search_columns(TreatmentRole())[0] 40 | ].keys() 41 | )[1:] 42 | for i in self._groups: 43 | groups += [i] * len(ids) 44 | diff = Dataset.create_empty() 45 | for i in range(len(ids)): 46 | diff = diff.append(experiment_data.analysis_tables[ids[i]]) 47 | targets += [ids[i].split(ID_SPLIT_SYMBOL)[-1]] 48 | return diff.add_column(groups, role={"group": StatisticRole()}).add_column( 49 | targets * len(self._groups), role={"feature": StatisticRole()} 50 | ) 51 | 52 | def _extract_sizes(self, experiment_data: ExperimentData): 53 | ids = experiment_data.get_ids( 54 | GroupSizes, 55 | searched_space=ExperimentDataEnum.analysis_tables, 56 | )["GroupSizes"]["analysis_tables"] 57 | self.sizes = experiment_data.analysis_tables[ids[0]].add_column( 58 | self._groups, role={"group": StatisticRole()} 59 | ) 60 | 61 | def extract(self, experiment_data: ExperimentData): 62 | super().extract(experiment_data) 63 | self._extract_differences(experiment_data) 64 | self._extract_multitest_result(experiment_data) 65 | self._extract_sizes(experiment_data) 66 | -------------------------------------------------------------------------------- /hypex/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .constants import ( 2 | ID_SPLIT_SYMBOL, 3 | MATCHING_INDEXES_SPLITTER_SYMBOL, 4 | NAME_BORDER_SYMBOL, 5 | NUMBER_TYPES_LIST, 6 | ) 7 | from .enums import ABNTestMethodsEnum, BackendsEnum, ExperimentDataEnum, SpaceEnum 8 | from .errors import ( 9 | AbstractMethodError, 10 | BackendTypeError, 11 | ConcatBackendError, 12 | ConcatDataError, 13 | DataTypeError, 14 | MergeOnError, 15 | NoColumnsError, 16 | NoRequiredArgumentError, 17 | NotFoundInExperimentDataError, 18 | NotSuitableFieldError, 19 | RoleColumnError, 20 | SpaceError, 21 | ) 22 | from .tutorial_data_creation import ( 23 | create_test_data, 24 | gen_control_variates_df, 25 | gen_oracle_df, 26 | gen_special_medicine_df, 27 | ) 28 | from .typings import ( 29 | CategoricalTypes, 30 | DecoratedType, 31 | DefaultRoleTypes, 32 | DocstringInheritDecorator, 33 | FromDictTypes, 34 | GroupingDataType, 35 | MultiFieldKeyTypes, 36 | RoleNameType, 37 | ScalarType, 38 | SetParamsDictTypes, 39 | StratificationRoleTypes, 40 | TargetRoleTypes, 41 | ) 42 | 43 | __all__ = [ 44 | "ID_SPLIT_SYMBOL", 45 | "MATCHING_INDEXES_SPLITTER_SYMBOL", 46 | "NAME_BORDER_SYMBOL", 47 | "NUMBER_TYPES_LIST", 48 | "ABNTestMethodsEnum", 49 | "AbstractMethodError", 50 | "BackendTypeError", 51 | "BackendsEnum", 52 | "CategoricalTypes", 53 | "ConcatBackendError", 54 | "ConcatDataError", 55 | "DataTypeError", 56 | "DecoratedType", 57 | "DefaultRoleTypes", 58 | "DocstringInheritDecorator", 59 | "ExperimentDataEnum", 60 | "FromDictTypes", 61 | "GroupingDataType", 62 | "MergeOnError", 63 | "MultiFieldKeyTypes", 64 | "NoColumnsError", 65 | "NoRequiredArgumentError", 66 | "NotFoundInExperimentDataError", 67 | "NotSuitableFieldError", 68 | "RoleColumnError", 69 | "RoleNameType", 70 | "ScalarType", 71 | "SetParamsDictTypes", 72 | "SpaceEnum", 73 | "SpaceError", 74 | "StratificationRoleTypes", 75 | "TargetRoleTypes", 76 | "create_test_data", 77 | "gen_control_variates_df", 78 | "gen_oracle_df", 79 | "gen_special_medicine_df", 80 | ] 81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "HypEx" 3 | version = "1.0.2" 4 | description = "Fast and customizable framework for Causal Inference" 5 | authors = [ 6 | "Dmitry Tikhomirov ", 7 | "Dmitry Bulychev ", 8 | "Ivan Yurashku ", 9 | "Anton Katkov ", 10 | "Ruslan Alsherov ", 11 | "Ksenia Vasilieva ", 12 | "Anastasiia Fedorova " 13 | ] 14 | readme = "README.md" 15 | license = "Apache-2.0" 16 | repository = "https://github.com/sb-ai-lab/HypEx" 17 | classifiers = [ 18 | "Programming Language :: Python :: 3 :: Only", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "Operating System :: OS Independent", 25 | "Intended Audience :: Science/Research", 26 | "Development Status :: 4 - Beta", 27 | "Environment :: Console", 28 | "Natural Language :: English", 29 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 30 | "Topic :: Scientific/Engineering :: Mathematics", 31 | "Typing :: Typed", 32 | ] 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.8, <3.13" 36 | tqdm = "*" 37 | scikit-learn = "*" 38 | 39 | pandas = [ 40 | { version = ">=1.3.5, <=2.0.3", python = "<3.9" }, 41 | { version = ">=1.3.5, 
<=2.2.3", python = ">=3.9" } 42 | ] 43 | 44 | numpy = [ 45 | { version = ">=1.17.0, <=1.24.4", python = "<3.9" }, 46 | { version = ">=1.17.0, <=1.26.4", python = ">=3.9" } 47 | ] 48 | 49 | scipy = [ 50 | { version = ">=1.5.0, <=1.10.1", python = "<3.9" }, 51 | { version = ">=1.5.0, <=1.13.1", python = ">=3.9" } 52 | ] 53 | 54 | matplotlib = [ 55 | { version = ">=3.0.0, <=3.7.3", python = "<3.9" }, 56 | { version = ">=3.0.0, <=3.9.0", python = ">=3.9" } 57 | ] 58 | 59 | faiss-cpu = ">=1.6.0, <=1.8.0" 60 | seaborn = "<=0.13.2" 61 | statsmodels = "<=0.14.2" 62 | 63 | [tool.poetry.extras] 64 | cat = ["catboost"] 65 | lgbm = ["lightgbm"] 66 | all = ["catboost", "lightgbm"] 67 | 68 | [tool.poetry.group.dev.dependencies] 69 | docutils = ">=0.17,<0.21" 70 | jupyter = "^1.0.0" 71 | pytest = "^7.4.3" 72 | sphinx = { version = "^7.2.6", python = ">=3.9, <3.11" } 73 | nbsphinx = "*" 74 | nbsphinx_link = "*" 75 | sphinx_autodoc-typehints = "*" 76 | sphinx_rtd_theme = "^1.2.2" 77 | ruff = "*" 78 | alive-progress = "^3.1.0" 79 | psutil = "^7.0.0" 80 | jsonschema = "^4.23.0" 81 | 82 | 83 | [build-system] 84 | requires = ["poetry-core"] 85 | build-backend = "poetry.core.masonry.api" 86 | 87 | [tool.ruff] 88 | line-length = 88 89 | target-version = "py38" 90 | 91 | [tool.ruff.lint] 92 | select = ["E", "F", "W", "C90", "UP", "RUF", "I"] 93 | ignore = ["E501", "RUF003", "C901"] 94 | -------------------------------------------------------------------------------- /hypex/extensions/faiss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | import faiss  # type: ignore 6 | import numpy as np 7 | import pandas as pd  # type: ignore 8 | 9 | from ..dataset import AdditionalMatchingRole, Dataset 10 | from .abstract import MLExtension 11 | 12 | 13 | class FaissExtension(MLExtension): 14 | def __init__( 15 | self, n_neighbors: int = 1, faiss_mode: Literal["base", "fast", "auto"] = "auto" 16 | ): 17 | self.n_neighbors = n_neighbors 18 | self.faiss_mode = faiss_mode 19 | super().__init__() 20 | 21 | @staticmethod 22 | def _prepare_indexes(index: np.ndarray, dist: np.ndarray, k: int): 23 | new = [ 24 | np.concatenate( 25 | [val[np.where(dist[i] == d)[0]] for d in sorted(set(dist[i]))[:k]] 26 | ) 27 | for i, val in enumerate(index) 28 | ] 29 | return new 30 | 31 | def _predict(self, data: Dataset, test_data: Dataset, X: np.ndarray) -> pd.Series: 32 | dist, indexes = self.index.search(X, k=self.n_neighbors) 33 | if self.n_neighbors == 1: 34 | equal_dist = list(map(lambda x: np.where(x == x[0])[0], dist)) 35 | indexes = [ 36 | ( 37 | int(index[dist][0]) 38 | if abs(index[dist][0]) <= len(data) + len(test_data) 39 | else -1 40 | ) 41 | for index, dist in zip(indexes, equal_dist) 42 | ] 43 | else: 44 | indexes = self._prepare_indexes(indexes, dist, self.n_neighbors) 45 | return pd.Series(indexes) 46 | 47 | def _calc_pandas( 48 | self, 49 | data: Dataset, 50 | test_data: Dataset | None = None, 51 | mode: Literal["auto", "fit", "predict"] | None = None, 52 | **kwargs, 53 | ): 54 | mode = mode or "auto" 55 | X = data.data.values 56 | test = test_data.data.values if test_data is not None else np.empty((0, X.shape[1])) 57 | if mode in ["auto", "fit"]: 58 | self.index = faiss.IndexFlatL2(X.shape[1]) 59 | if (( 60 | len(X) > 1_000_000 and self.faiss_mode == "auto" 61 | ) or self.faiss_mode == "fast" 62 | ) and len(X) > 1_000 and len(test) > 1_000: 63 | self.index = faiss.IndexIVFFlat(self.index, X.shape[1], 1000) 64 | self.index.train(X) 65 | self.index.add(X) 66 | if mode in 
["auto", "predict"]: 67 | if test_data is None: 68 | raise ValueError("test_data is needed for evaluation") 69 | X = test_data.data.values if mode == "auto" else data.data.values 70 | return self._predict(data, test_data, X) 71 | return self 72 | 73 | def fit(self, X: Dataset, Y: Dataset | None = None, **kwargs): 74 | return super().calc(X, target_data=Y, mode="fit", **kwargs) 75 | 76 | def predict(self, X: Dataset, **kwargs) -> Dataset: 77 | return self.result_to_dataset( 78 | super().calc(X, mode="predict", **kwargs), AdditionalMatchingRole() 79 | ) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | *.csv 3 | *.png 4 | *.pickle 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # DS_store 15 | .DS_Store 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | tabularAutoML_model_report/ 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | docs/_autosummary/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # VSCode 134 | .vscode 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | # pytype static type analyzers 148 | .pytype/ 149 | 150 | # Cython debug symbols 151 | cython_debug/ 152 | 153 | # VSCode 154 | .vscode/ 155 | 156 | .idea/ 157 | lama_venv/ 158 | *.db 159 | 160 | temp/ 161 | 162 | poetry.lock 163 | -------------------------------------------------------------------------------- /hypex/factory/base.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | # 3 | # from ..analyzers import ABAnalyzer, OneAAStatAnalyzer 4 | # from ..comparators import GroupDifference, GroupSizes, ATE, TTest, KSTest, UTest 5 | # from ..dataset import ( 6 | # ExperimentData, 7 | # Arg1Role, 8 | # Arg2Role, 9 | # InfoRole, 10 | # TargetRole, 11 | # FeatureRole, 12 | # GroupingRole, 13 | # PreTargetRole, 14 | # StatisticRole, 15 | # StratificationRole, 16 | # TreatmentRole, 17 | # TempTreatmentRole, 18 | # TempGroupingRole, 19 | # TempTargetRole, 20 | # ) 21 | # from ..experiments import ( 22 | # Experiment, 23 | # OnRoleExperiment, 24 | # GroupExperiment, 25 | # CycledExperiment, 26 | # ) 27 | # from ..reporters import OneAADictReporter 28 | # from ..transformers import Shuffle 29 | # from ..utils import ExperimentDataEnum, SpaceEnum 30 | # 31 | # all_classes = [ 32 | # ABAnalyzer, 33 | # OneAAStatAnalyzer, 34 | # GroupDifference, 35 | # GroupSizes, 36 | # ATE, 37 | # TTest, 38 | # KSTest, 39 | # UTest, 40 | # Arg1Role, 41 | # Arg2Role, 42 | # InfoRole, 43 | # TargetRole, 44 | # FeatureRole, 45 | # GroupingRole, 46 | # PreTargetRole, 47 | # StatisticRole, 48 | # StratificationRole, 49 | # TreatmentRole, 50 | # TempTreatmentRole, 51 | # TempGroupingRole, 52 | # TempTargetRole, 53 | # OnRoleExperiment, 54 | # GroupExperiment, 55 | # CycledExperiment, 56 | # OneAADictReporter, 57 | # Shuffle, 58 | # ExperimentDataEnum, 59 | # SpaceEnum, 60 | # ] 61 | # 62 | # spaces = { 63 | # "additional": SpaceEnum.additional, 64 | # "auto": SpaceEnum.auto, 65 | # "data": SpaceEnum.data, 66 | # } 67 | # 68 | # 69 | # class Factory: 70 | # def __init__(self, hypothesis): 71 | # self.hypothesis = hypothesis 72 | # 73 | # def make_experiment(self, experiment): 74 | # executors = [] 75 | # for key, items in experiment.items(): 76 | # class_ = getattr(sys.modules[__name__], key) 77 | # if "executors" in items or "inner_executors" in items: 78 | # item = "executors" if "executors" in items else "inner_executors" 79 | # items[f"{item}"] = self.make_experiment(experiment[key][f"{item}"][0]) 80 | # if "role" in items or "grouping_role" in items: 81 | # item = "role" if "role" in items else "grouping_role" 82 | # items[f"{item}"] = getattr( 83 | # sys.modules[__name__], items[item] + "Role" 84 | # )() 85 | # if "space" in items: 86 | # items["space"] = spaces.get(items["space"]) 87 | # items = {i: None if j == "None" else j for i, j in items.items()} 88 | # executors.append(class_(**items)) 89 | # return executors 90 | # 91 | # def 
execute(self): 92 | #         experiment_data = ExperimentData(self.hypothesis.dataset) 93 | #         experiment = Experiment( 94 | #             executors=self.make_experiment(self.hypothesis.experiment) 95 | #         ) 96 | #         return experiment_data, experiment 97 | -------------------------------------------------------------------------------- /hypex/utils/errors.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | 4 | class RoleColumnError(Exception): 5 | def __init__(self, roles, columns): 6 | super().__init__( 7 | "Check your roles. All of them must be names of data columns. \n" 8 | f"Now roles have {roles} values and columns have {columns} values" 9 | ) 10 | 11 | 12 | class ConcatDataError(Exception): 13 | def __init__(self, data_type): 14 | super().__init__(f"Can only append Dataset to Dataset. Got {data_type}") 15 | 16 | 17 | class ConcatBackendError(Exception): 18 | def __init__(self, other_backend, backend): 19 | super().__init__( 20 | f"Can only append data with the same backends. Got {other_backend} expected {backend}" 21 | ) 22 | 23 | 24 | class SpaceError(Exception): 25 | def __init__(self, space): 26 | super().__init__(f"{space} is not a valid space") 27 | 28 | 29 | class NoColumnsError(Exception): 30 | def __init__(self, role): 31 | super().__init__(f"No columns found by role {role}") 32 | 33 | 34 | class NotSuitableFieldError(Exception): 35 | def __init__(self, field, field_role: Literal["Grouping", "Target", "Baseline"]): 36 | super().__init__(f"{field_role} field {field} is not suitable for comparison") 37 | 38 | 39 | class NotFoundInExperimentDataError(Exception): 40 | def __init__(self, class_: str): 41 | super().__init__(f"{class_} id is not found in ExperimentData") 42 | 43 | 44 | class AbstractMethodError(NotImplementedError): 45 | def __init__(self): 46 | super().__init__( 47 | "This method is abstract and must be overridden in a derived class." 48 | ) 49 | 50 | 51 | class DataTypeError(Exception): 52 | def __init__(self, data_type): 53 | super().__init__( 54 | f"Can only perform the operation between two Datasets. Got {data_type}" 55 | ) 56 | 57 | 58 | class BackendTypeError(Exception): 59 | def __init__(self, other_backend, backend): 60 | super().__init__( 61 | f"Can only perform the operation with the same backends. Got {other_backend} expected {backend}" 62 | ) 63 | 64 | 65 | class MergeOnError(Exception): 66 | def __init__(self, on): 67 | super().__init__(f"Can only merge on one of the data columns. Got {on}") 68 | 69 | 70 | class NoRequiredArgumentError(Exception): 71 | def __init__(self, argument_name): 72 | super().__init__(f"The required argument {argument_name} has not been passed.") 73 | 74 | 75 | class NoneArgumentError(Exception): 76 | def __init__(self, arg, process): 77 | super().__init__(f"Argument {arg} is None in process {process}.") 78 | 79 | 80 | class InvalidArgumentError(Exception): 81 | def __init__(self, arg, possible_type): 82 | super().__init__( 83 | f"Invalid type for argument {arg}, the possible type is {possible_type}." 84 | ) 85 | 86 | 87 | class PairsNotFoundError(Exception): 88 | def __init__(self): 89 | super().__init__( 90 | "Pairs were not found. Check your input data and try executing the preprocessing pipeline before matching estimation." 
91 | ) 92 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Improved colors and styles */ 2 | .wy-nav-content { 3 | max-width: 1200px; 4 | } 5 | 6 | /* Attractive headings */ 7 | h1 { 8 | color: #2980B9; 9 | border-bottom: 2px solid #2980B9; 10 | padding-bottom: 10px; 11 | } 12 | 13 | h2 { 14 | color: #34495e; 15 | margin-top: 30px; 16 | } 17 | 18 | /* Code block styling */ 19 | .highlight { 20 | background: #f8f8f8 !important; 21 | border: 1px solid #e1e4e8; 22 | border-radius: 6px; 23 | margin: 1em 0; 24 | } 25 | 26 | /* Attractive admonitions */ 27 | .admonition { 28 | border-radius: 6px; 29 | padding: 12px; 30 | border-left: 4px solid; 31 | } 32 | 33 | .admonition.note { 34 | background-color: #e3f2fd; 35 | border-left-color: #2196f3; 36 | } 37 | 38 | .admonition.warning { 39 | background-color: #fff3cd; 40 | border-left-color: #ffc107; 41 | } 42 | 43 | .admonition.tip { 44 | background-color: #e8f5e9; 45 | border-left-color: #4caf50; 46 | } 47 | 48 | /* Improved tables */ 49 | .wy-table-responsive table td, .wy-table-responsive table th { 50 | white-space: normal; 51 | } 52 | 53 | table.docutils { 54 | border: 1px solid #e1e4e8; 55 | border-collapse: collapse; 56 | border-spacing: 0; 57 | width: 100%; 58 | } 59 | 60 | table.docutils tr:nth-child(2n) { 61 | background-color: #f6f8fa; 62 | } 63 | 64 | /* Attractive buttons */ 65 | .btn { 66 | display: inline-block; 67 | padding: 6px 12px; 68 | margin-bottom: 0; 69 | font-size: 14px; 70 | font-weight: 400; 71 | line-height: 1.42857143; 72 | text-align: center; 73 | white-space: nowrap; 74 | vertical-align: middle; 75 | cursor: pointer; 76 | border: 1px solid transparent; 77 | border-radius: 4px; 78 | background-color: #2980B9; 79 | color: white; 80 | text-decoration: none; 81 | } 82 | 83 | .btn:hover { 84 | background-color: #21618C; 85 | color: white; 86 | } 87 | 88 | /* API Reference styling */ 89 | .class dt { 90 | font-size: 1.1em; 91 | font-weight: bold; 92 | margin-top: 20px; 93 | padding: 10px; 94 | background-color: #f0f0f0; 95 | border-left: 3px solid #2980B9; 96 | } 97 | 98 | /* Parameters styling */ 99 | .field-list { 100 | margin: 20px 0; 101 | } 102 | 103 | .field-name { 104 | font-weight: bold; 105 | min-width: 100px; 106 | padding-right: 20px; 107 | } 108 | 109 | /* Navigation */ 110 | .wy-menu-vertical li.current > a { 111 | background: #fcfcfc; 112 | border-right: solid 3px #2980B9; 113 | } 114 | 115 | .wy-menu-vertical li.current a { 116 | color: #404040; 117 | } 118 | 119 | /* Badges */ 120 | .badge { 121 | display: inline-block; 122 | padding: 3px 7px; 123 | font-size: 12px; 124 | font-weight: bold; 125 | line-height: 1; 126 | color: #fff; 127 | text-align: center; 128 | white-space: nowrap; 129 | vertical-align: baseline; 130 | border-radius: 10px; 131 | } 132 | 133 | .badge-primary { 134 | background-color: #2980B9; 135 | } 136 | 137 | .badge-success { 138 | background-color: #27ae60; 139 | } 140 | 141 | .badge-warning { 142 | background-color: #f39c12; 143 | } -------------------------------------------------------------------------------- /hypex/experiments/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any, Iterable, Sequence 5 | 6 | from ..dataset import ABCRole, ExperimentData, TempTargetRole 7 | from ..executor import Executor 8 | 
from ..utils import ExperimentDataEnum 9 | 10 | 11 | class Experiment(Executor): 12 | def _detect_transformer(self) -> bool: 13 | return all(executor._is_transformer for executor in self.executors) 14 | 15 | def get_executor_ids( 16 | self, searched_classes: type | Iterable[type] | None = None 17 | ) -> dict[type, list[str]]: 18 | if not searched_classes: 19 | return {} 20 | 21 | searched_classes = ( 22 | searched_classes 23 | if isinstance(searched_classes, Iterable) 24 | else [searched_classes] 25 | ) 26 | return { 27 | searched_class: [ 28 | executor.id 29 | for executor in self.executors 30 | if isinstance(executor, searched_class) 31 | ] 32 | for searched_class in searched_classes 33 | } 34 | 35 | def __init__( 36 | self, 37 | executors: Sequence[Executor], 38 | transformer: bool | None = None, 39 | key: Any = "", 40 | ): 41 | self.executors: Sequence[Executor] = executors 42 | self.transformer: bool = ( 43 | transformer if transformer is not None else self._detect_transformer() 44 | ) 45 | super().__init__(key) 46 | 47 | def set_params(self, params: dict[str, Any] | dict[type, dict[str, Any]]) -> None: 48 | if isinstance(next(iter(params)), str): 49 | super().set_params(params) 50 | elif isinstance(next(iter(params)), type): 51 | for executor in self.executors: 52 | executor.set_params(params) 53 | else: 54 | raise ValueError( 55 | "params must be a dict of str to dict or a dict of class to dict" 56 | ) 57 | 58 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 59 | return data.set_value(ExperimentDataEnum.analysis_tables, self.id, value) 60 | 61 | def execute(self, data: ExperimentData) -> ExperimentData: 62 | experiment_data = deepcopy(data) if self.transformer else data 63 | for executor in self.executors: 64 | executor.key = self.key 65 | experiment_data = executor.execute(experiment_data) 66 | return experiment_data 67 | 68 | 69 | class OnRoleExperiment(Experiment): 70 | def __init__( 71 | self, 72 | executors: list[Executor], 73 | role: ABCRole | Sequence[ABCRole], 74 | transformer: bool | None = None, 75 | key: Any = "", 76 | ): 77 | self.role: list[ABCRole] = [role] if isinstance(role, ABCRole) else list(role) 78 | super().__init__(executors, transformer, key) 79 | 80 | def execute(self, data: ExperimentData) -> ExperimentData: 81 | for field in data.ds.search_columns(self.role): 82 | data.ds.tmp_roles = {field: TempTargetRole()} 83 | data = super().execute(data) 84 | data.ds.tmp_roles = {} 85 | return data 86 | -------------------------------------------------------------------------------- /hypex/hypotheses/hypothesis.py: -------------------------------------------------------------------------------- 1 | # import json 2 | # from typing import Optional, Union, Dict, Any 3 | # 4 | # from jsonschema import validate # type: ignore 5 | # 6 | # from hypex.dataset import Dataset, default_roles, InfoRole 7 | # from hypex.factory.base import Factory 8 | # 9 | # 10 | # class Hypothesis: 11 | # def __init__(self, config: Union[str, Dict[str, Any]]): 12 | # if isinstance(config, str): 13 | # with open(config, "rb") as file: 14 | # opened_config = json.load(file) 15 | # else: 16 | # opened_config = config 17 | # with open("hypex\\hypotheses\\schemes\\scheme.json", "rb") as file: 18 | # self.scheme = json.load(file) 19 | # self.config = opened_config 20 | # self.dataset = self.config.get("dataset") 21 | # self.experiment = self.config.get("experiment") 22 | # self.report = self.config.get("report") 23 | # self.validate_config() 24 | # self._parse_config() 
25 | # 26 | # def validate_config(self): 27 | # validate(self.config, self.scheme) 28 | # if ( 29 | # "data" in self.dataset.keys() 30 | # and "path" not in self.dataset.keys() 31 | # and not self.dataset["data"]["data"] 32 | # ): 33 | # raise ValueError("Data or path to data must be added") 34 | # # if len(self.dataset["roles"]["role_names"]) != len( 35 | # # self.dataset["roles"]["columns"] 36 | # # ): 37 | # # raise ValueError( 38 | # # f"Invalid number of columns and role_names. Columns and role_names must have equal length.\n " 39 | # # f"role_names contains {len(self.dataset['roles']['role_names'])} values and columns contains {len(self.dataset['roles']['columns'])}" 40 | # # ) 41 | # 42 | # def _parse_config(self): 43 | # self.dataset = self._parse_dataset() 44 | # 45 | # def _parse_dataset(self): 46 | # data = ( 47 | # self.dataset["data"] 48 | # if "data" in self.dataset.keys() 49 | # else self.dataset["path"] 50 | # ) 51 | # roles = {} 52 | # for column in self.dataset["columns"]: 53 | # role = default_roles.get(column["role"].lower(), InfoRole()) 54 | # role.data_type = column["dataType"] if column.get("dataType") else None 55 | # roles.update({column["name"]: role}) 56 | # return Dataset(data=data, roles=roles, backend=self.dataset["backend"]) 57 | # 58 | # def to_json(self, file: Optional[str] = None): 59 | # # return json.dumps(self.dataset.to_json(), self.experiment.to_json(), self.report.to_json()) 60 | # if file: 61 | # with open(file, "w") as f: 62 | # json.dump( 63 | # {"dataset": self.dataset.to_dict(), "experiment": {}, "report": {}}, 64 | # f, 65 | # indent=4, 66 | # ) 67 | # return json.dumps( 68 | # {"dataset": self.dataset.to_dict(), "experiment": {}, "report": {}} 69 | # ) 70 | # 71 | # def execute(self): 72 | # experiment_data, self.experiment = Factory(self).execute() 73 | # return experiment_data, self.experiment 74 | -------------------------------------------------------------------------------- /hypex/reporters/matching.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, ClassVar 4 | 5 | from ..analyzers.matching import MatchingAnalyzer 6 | from ..comparators import KSTest, TTest 7 | from ..dataset import Dataset, ExperimentData 8 | from ..ml import FaissNearestNeighbors 9 | from ..reporters.abstract import DatasetReporter, DictReporter, TestDictReporter 10 | from ..utils import ( 11 | ID_SPLIT_SYMBOL, 12 | MATCHING_INDEXES_SPLITTER_SYMBOL, 13 | ExperimentDataEnum, 14 | ) 15 | 16 | 17 | class MatchingDictReporter(DictReporter): 18 | def __init__(self, searching_class: type = MatchingAnalyzer): 19 | self.searching_class = searching_class 20 | super().__init__() 21 | 22 | @staticmethod 23 | def _convert_dataset_to_dict(data: Dataset) -> dict[str, Any]: 24 | dict_data = data.to_dict()["data"] 25 | indexes = dict_data["index"] 26 | df = dict_data["data"] 27 | result = {} 28 | for key, values in df.items(): 29 | for index, value in zip(indexes, values): 30 | result[f"{key}{ID_SPLIT_SYMBOL}{index}"] = value 31 | return result 32 | 33 | def _extract_from_analyser(self, data: ExperimentData): 34 | analyzer_id = data.get_one_id( 35 | self.searching_class, ExperimentDataEnum.analysis_tables 36 | ) 37 | return self._convert_dataset_to_dict(data.analysis_tables[analyzer_id]) 38 | 39 | @staticmethod 40 | def _extract_from_additional_fields(data: ExperimentData): 41 | indexes_id = data.get_one_id( 42 | FaissNearestNeighbors, ExperimentDataEnum.additional_fields 43 | ) 
44 | return { 45 | "indexes": MATCHING_INDEXES_SPLITTER_SYMBOL.join( 46 | str(i) 47 | for i in data.additional_fields[indexes_id].to_dict()["data"]["data"][ 48 | indexes_id 49 | ] 50 | ) 51 | } 52 | 53 | def report(self, experiment_data: ExperimentData): 54 | result = {} 55 | result.update(self._extract_from_analyser(experiment_data)) 56 | if self.searching_class == MatchingAnalyzer: 57 | result.update(self._extract_from_additional_fields(experiment_data)) 58 | return result 59 | 60 | 61 | class MatchingQualityDictReporter(TestDictReporter): 62 | tests: ClassVar[list] = [TTest, KSTest] 63 | 64 | def report(self, data: ExperimentData) -> dict[str, Any]: 65 | return self.extract_tests(data) 66 | 67 | 68 | class MatchingQualityDatasetReporter(MatchingQualityDictReporter): 69 | @classmethod 70 | def convert_flat_dataset(cls, data: dict) -> Dataset: 71 | struct_dict = cls._get_struct_dict(data) 72 | return cls._convert_struct_dict_to_dataset(struct_dict) 73 | 74 | def report(self, data: ExperimentData): 75 | front_buffer = self.front 76 | self.front = False 77 | dict_report = super().report(data) 78 | self.front = front_buffer 79 | return self.convert_flat_dataset(dict_report) 80 | 81 | 82 | class MatchingDatasetReporter(DatasetReporter): 83 | def __init__(self, searching_class: type = MatchingAnalyzer) -> None: 84 | self.dict_reporter = MatchingDictReporter(searching_class) 85 | super().__init__(self.dict_reporter) 86 | -------------------------------------------------------------------------------- /hypex/utils/decorator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import wraps 4 | from typing import Any, Callable, cast 5 | 6 | from hypex.utils import DecoratedType, DocstringInheritDecorator 7 | 8 | 9 | def inherit_docstring_from( 10 | source: Callable[..., Any] | property, 11 | ) -> DocstringInheritDecorator: 12 | """A decorator to inherit the docstring from another function or property. 13 | 14 | This decorator can be applied to both callable objects and properties. It copies the docstring 15 | from the source object to the decorated object if the latter does not already have a docstring. 16 | 17 | Args: 18 | source: The object from which the docstring will be inherited. 19 | This should be either a callable or a property that has 20 | a well-defined __doc__ attribute. 21 | 22 | Returns: 23 | A decorator that when applied to a function or property, 24 | sets its __doc__ attribute to that of the source. 25 | 26 | Raises: 27 | TypeError: If the object to be decorated is neither a callable nor a property. 28 | 29 | Example: 30 | Using with property:: 31 | 32 | class SomeClass: 33 | @property 34 | @inherit_docstring_from(pd.DataFrame.iloc) 35 | def iloc(self): 36 | return self._data.iloc 37 | 38 | Using with method:: 39 | 40 | @inherit_docstring_from(pd.DataFrame.mean) 41 | def mean(self): 42 | return self._data.mean() 43 | """ 44 | 45 | def decorator(obj: DecoratedType) -> DecoratedType: 46 | """ 47 | Apply the inherited docstring to a given function or property. 48 | This function acts as a decorator within 'inherit_docstring_from', applying the docstring 49 | from the 'source' object to the 'obj'. If 'obj' is a property, it modifies the property to include 50 | the source's docstring. If 'obj' is a callable, it wraps the callable in a function that preserves 51 | the original callable's functionality and metadata but updates the docstring. 
52 | Args: 53 | obj (DecoratedType): The function or property to which the docstring will be applied. 54 | It must be either a callable or a property. 55 | Returns: 56 | DecoratedType: The original object with the updated docstring. If the object is a property, 57 | it returns a new property object with the inherited docstring. If it's a callable, 58 | it returns the wrapped callable with the updated docstring. 59 | Raises: 60 | TypeError: If 'obj' is neither a callable nor a property. 61 | """ 62 | if isinstance(obj, property): 63 | doc = getattr(source, "__doc__", "No documentation provided.") 64 | return property(obj.fget, obj.fset, obj.fdel, doc) 65 | elif callable(obj): 66 | 67 | @wraps(obj) 68 | def wrapper(*args, **kwargs) -> Any: 69 | return obj(*args, **kwargs) 70 | 71 | wrapper.__doc__ = getattr(source, "__doc__", "No documentation provided.") 72 | return cast(DecoratedType, wrapper) 73 | else: 74 | raise TypeError( 75 | "The decorator can only be applied to callables or properties." 76 | ) 77 | 78 | return decorator 79 | -------------------------------------------------------------------------------- /schemes/architecture_levels.md: -------------------------------------------------------------------------------- 1 | # HypEx abstraction levels 2 | Splitting the project into abstraction levels makes it easier for different categories of users to work with it. 3 | 4 | ## Level 1. User interface on the platform. 5 | **User** - a business user who wants to get data analysis results. 6 | 7 | **Segment** - platform 8 | 9 | **Idea** - let the user work with the project without special knowledge of statistics or writing code. Ideally, the user simply picks a scenario for their project, developed by financiers, and runs it. 10 | 11 | **Usage** - running a scenario in the interface on the platform. 12 | 13 | ## Level 2. Building scenarios in the constructor on the platform. 14 | **User** - a financier who wants to build a scenario that solves a business task. 15 | 16 | **Segment** - platform 17 | 18 | **Idea** - give a user with statistical knowledge and an understanding of business processes the ability to build scenarios in a convenient constructor without writing code. 19 | 20 | **Usage** - building scenarios in the constructor on the platform via a graphical interface. 21 | 22 | ## Level 3. A scenario configurable in template code. 23 | **User** - a business user with access to data and to a lab zone for running the scenario. 24 | 25 | **Segment** - Dev package 26 | 27 | **Idea** - let the user run a pre-programmed scenario with configurable parameters. 28 | 29 | **Usage** - running template code with configurable parameters in the lab zone. 30 | 31 | ## Level 4. Using the HypEx experiment shell. 32 | **User** - a data researcher who has studied the HypEx tutorial or documentation 33 | 34 | **Segment** - library 35 | 36 | **Idea** - let the user configure and run basic experiments in a couple of lines of code. 37 | 38 | **Usage** - configuring and running an experiment in a couple of lines of code using the HypEx experiment shell. 39 | 40 | 41 | ## Level 5. Building an experiment in code. 42 | **User** - a developer familiar with the basic building blocks of the HypEx library (Executor). 
43 | 44 | **Segment** - library 45 | 46 | **Idea** - let the user build custom experiments in code from the basic building blocks of the HypEx library. 47 | 48 | **Usage** - building an experiment in code using the basic building blocks of the HypEx library. 49 | 50 | ## Level 6. Creating a basic block (Executor) by inheriting from a standard block. 51 | **User** - a developer familiar with the standard blocks of the HypEx library. 52 | **Segment** - library 53 | **Idea** - let the user create custom basic blocks (Executor) by inheriting from the standard blocks of the HypEx library. 54 | **Usage** - creating a custom basic block (Executor) by inheriting from a standard block of the HypEx library. 55 | 56 | ## Level 7. Modifying the library. 57 | **User** - a developer who knows well how the HypEx library is structured and how it works. 58 | 59 | **Segment** - library 60 | 61 | **Idea** - deep reworking of the basic mechanics of the HypEx library. 62 | 63 | **Usage** - deep reworking of the basic mechanics of the HypEx library. 64 | 65 | ## Level 8. Core 66 | **User** - the HypEx architect 67 | 68 | **Segment** - library 69 | 70 | **Idea** - changing the fundamental behavior of the HypEx library. This usually means a new generation of the architecture. 71 | 72 | **Usage** - building the core of the HypEx library -------------------------------------------------------------------------------- /hypex/operators/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import abstractmethod 4 | from typing import Any, Sequence 5 | 6 | from ..dataset import ( 7 | ABCRole, 8 | AdditionalTargetRole, 9 | Dataset, 10 | ExperimentData, 11 | GroupingRole, 12 | TargetRole, 13 | ) 14 | from ..executor import Calculator 15 | from ..utils import AbstractMethodError, ExperimentDataEnum, NotSuitableFieldError 16 | from ..utils.adapter import Adapter 17 | 18 | 19 | class GroupOperator( 20 | Calculator 21 | ):  # TODO: change the base class from Calculator to Comparator 22 | def __init__( 23 | self, 24 | grouping_role: ABCRole | None = None, 25 | target_roles: ABCRole | list[ABCRole] | None = None, 26 | key: Any = "", 27 | ): 28 | super().__init__(key=key) 29 | self.target_roles = target_roles or TargetRole() 30 | self.grouping_role = grouping_role or GroupingRole() 31 | 32 | @property 33 | def search_types(self): 34 | return None 35 | 36 | @classmethod 37 | @abstractmethod 38 | def _inner_function( 39 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 40 | ) -> Any: 41 | raise AbstractMethodError 42 | 43 | def _get_fields(self, data: ExperimentData): 44 | group_field = data.field_search(self.grouping_role) 45 | target_fields = data.field_search( 46 | self.target_roles, search_types=self.search_types 47 | ) 48 | if len(target_fields) != 2: 49 | target_fields += data.field_search( 50 | AdditionalTargetRole(), search_types=self.search_types 51 | ) 52 | return group_field, target_fields 53 | 54 | @classmethod 55 | def _execute_inner_function( 56 | cls, 57 | grouping_data, 58 | target_fields: list[str] | None = None, 59 | **kwargs, 60 | ) -> dict: 61 | if target_fields is None or len(target_fields) != 2: 62 | raise ValueError( 63 | f"This operator works with 2 targets, but got {len(target_fields) if target_fields else None}" 64 | ) 65 | result = {} 66 | for group, group_data in grouping_data: 67 | result[group[0]] = cls._inner_function( 68 | 
data=group_data[target_fields[0]], 69 | test_data=group_data[target_fields[1]], 70 | **kwargs, 71 | ) 72 | return result 73 | 74 | @classmethod 75 | def calc( 76 | cls, 77 | data: Dataset, 78 | group_field: Sequence[str] | str | None = None, 79 | grouping_data: list[tuple[str, Dataset]] | None = None, 80 | target_fields: str | list[str] | None = None, 81 | **kwargs, 82 | ) -> dict: 83 | group_field = Adapter.to_list(group_field) 84 | 85 | if grouping_data is None: 86 | grouping_data = data.groupby(group_field) 87 | if len(grouping_data) > 1: 88 | grouping_data[0][1].tmp_roles = data.tmp_roles 89 | else: 90 | raise NotSuitableFieldError(group_field, "Grouping") 91 | return cls._execute_inner_function( 92 | grouping_data, target_fields=target_fields, old_data=data, **kwargs 93 | ) 94 | 95 | def _set_value( 96 | self, data: ExperimentData, value: dict | None = None, key: Any = None 97 | ) -> ExperimentData: 98 | data.set_value( 99 | ExperimentDataEnum.variables, 100 | self.id, 101 | value, 102 | ) 103 | return data 104 | -------------------------------------------------------------------------------- /examples/experiments/performance_test/config.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Parameters Configuration", 4 | "description": "Schema for onefactor, montecarlo and fixed parameters", 5 | "type": "object", 6 | "properties": { 7 | "onefactor_params": { 8 | "type": "object", 9 | "properties": { 10 | "n_rows": { 11 | "type": "array", 12 | "items": { 13 | "type": "integer", 14 | "minimum": 1 15 | } 16 | }, 17 | "n_columns": { 18 | "type": "array", 19 | "items": { 20 | "type": "integer", 21 | "minimum": 1 22 | } 23 | }, 24 | "n_iterations": { 25 | "type": "array", 26 | "items": { 27 | "type": "integer", 28 | "minimum": 1 29 | } 30 | } 31 | }, 32 | "additionalProperties": true 33 | }, 34 | "montecarlo_params": { 35 | "type": "object", 36 | "properties": { 37 | "num_points": { 38 | "type": "integer", 39 | "minimum": 1 40 | }, 41 | "bounds": { 42 | "type": "object", 43 | "properties": { 44 | "n_rows": { 45 | "type": "object", 46 | "properties": { 47 | "max": { 48 | "type": "integer", 49 | "minimum": 1 50 | }, 51 | "min": { 52 | "type": "integer", 53 | "minimum": 1 54 | } 55 | }, 56 | "required": ["max", "min"], 57 | "additionalProperties": false 58 | }, 59 | "n_iterations": { 60 | "type": "object", 61 | "properties": { 62 | "max": { 63 | "type": "integer", 64 | "minimum": 1 65 | }, 66 | "min": { 67 | "type": "integer", 68 | "minimum": 1 69 | } 70 | }, 71 | "required": ["max", "min"], 72 | "additionalProperties": false 73 | }, 74 | "n_columns": { 75 | "type": "object", 76 | "properties": { 77 | "max": { 78 | "type": "integer", 79 | "minimum": 1 80 | }, 81 | "min": { 82 | "type": "integer", 83 | "minimum": 1 84 | } 85 | }, 86 | "required": ["max", "min"], 87 | "additionalProperties": false 88 | } 89 | }, 90 | "additionalProperties": true 91 | } 92 | }, 93 | "required": ["num_points", "bounds"], 94 | "additionalProperties": true 95 | }, 96 | "fixed_params": { 97 | "type": "object", 98 | "properties": { 99 | "n_columns": { 100 | "type": "integer", 101 | "minimum": 1 102 | }, 103 | "n_rows": { 104 | "type": "integer", 105 | "minimum": 1 106 | }, 107 | "n_iterations": { 108 | "type": "integer", 109 | "minimum": 1 110 | } 111 | }, 112 | "additionalProperties": true 113 | } 114 | }, 115 | "additionalProperties": true 116 | } 
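For reference, a run's parameters can be checked against the schema above before any experiment starts, using jsonschema, the same library the commented-out Hypothesis.validate_config relies on. A minimal sketch; the file paths here are illustrative assumptions, not fixed by the project:

import json

from jsonschema import ValidationError, validate

# Paths are hypothetical; point them at a real config and the schema above.
with open("examples/experiments/performance_test/config.json") as f:
    config = json.load(f)
with open("examples/experiments/performance_test/config.schema.json") as f:
    schema = json.load(f)

try:
    # Raises ValidationError if, e.g., montecarlo_params lacks "num_points"
    # or a bounds entry is missing its "min"/"max" keys.
    validate(instance=config, schema=schema)
    print("config is valid")
except ValidationError as err:
    print(f"invalid config: {err.message}")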
-------------------------------------------------------------------------------- /hypex/comparators/comparators.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | import numpy as np 6 | 7 | from ..dataset import ABCRole, Dataset 8 | from ..utils.constants import NUMBER_TYPES_LIST 9 | from .abstract import Comparator 10 | 11 | NUM_OF_BUCKETS = 10 12 | 13 | 14 | class GroupDifference(Comparator): 15 | def __init__( 16 | self, 17 | compare_by: Literal[ 18 | "groups", "columns", "columns_in_groups", "cross", "matched_pairs" 19 | ] = "groups", 20 | grouping_role: ABCRole | None = None, 21 | target_roles: ABCRole | list[ABCRole] | None = None, 22 | ): 23 | super().__init__( 24 | compare_by=compare_by, 25 | grouping_role=grouping_role, 26 | target_roles=target_roles, 27 | ) 28 | 29 | @property 30 | def search_types(self) -> list[type] | None: 31 | return NUMBER_TYPES_LIST 32 | 33 | @classmethod 34 | def _inner_function( 35 | cls, 36 | data: Dataset, 37 | test_data: Dataset | None = None, 38 | **kwargs, 39 | ) -> dict: 40 | test_data = cls._check_test_data(test_data) 41 | control_mean = data.mean() 42 | test_mean = test_data.mean() 43 | 44 | return { 45 | "control mean": control_mean, 46 | "test mean": test_mean, 47 | "difference": test_mean - control_mean, 48 | "difference %": ( 49 | (test_mean / control_mean - 1) * 100 if control_mean != 0 else None 50 | ), 51 | } 52 | 53 | 54 | class GroupSizes(Comparator): 55 | def __init__( 56 | self, 57 | compare_by: Literal[ 58 | "groups", "columns", "columns_in_groups", "cross", "matched_pairs" 59 | ] = "groups", 60 | grouping_role: ABCRole | None = None, 61 | ): 62 | super().__init__( 63 | compare_by=compare_by, 64 | grouping_role=grouping_role, 65 | target_roles=grouping_role, 66 | ) 67 | 68 | @classmethod 69 | def _inner_function( 70 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 71 | ) -> dict: 72 | size_a = len(data) 73 | size_b = len(test_data) if isinstance(test_data, Dataset) else 0 74 | 75 | return { 76 | "control size": size_a, 77 | "test size": size_b, 78 | "control size %": (size_a / (size_a + size_b)) * 100, 79 | "test size %": (size_b / (size_a + size_b)) * 100, 80 | } 81 | 82 | 83 | class PSI(Comparator): 84 | @classmethod 85 | def _inner_function( 86 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 87 | ) -> dict[str, float]: 88 | test_data = cls._check_test_data(test_data=test_data) 89 | data.sort(ascending=False) 90 | test_data.sort(ascending=False) 91 | data_column = data.iloc[:, 0] 92 | test_data_column = test_data.iloc[:, 0] 93 | data_bins = np.arange( 94 | data_column.min(), 95 | data_column.max(), 96 | (data_column.max() - data_column.min()) / NUM_OF_BUCKETS, 97 | ) 98 | test_data_bins = np.arange( 99 | test_data_column.min(), 100 | test_data_column.max(), 101 | (test_data_column.max() - test_data_column.min()) / NUM_OF_BUCKETS, 102 | ) 103 | data_groups = data_column.groupby( 104 | data_column.cut(data_bins).get_values(column=data.columns[0]) 105 | ) 106 | test_data_groups = test_data_column.groupby( 107 | test_data_column.cut(test_data_bins).get_values(column=test_data.columns[0]) 108 | ) 109 | 110 | data_psi = [x[1].count() / len(data) for x in data_groups] 111 | test_data_psi = [x[1].count() / len(test_data) for x in test_data_groups] 112 | psi = [(y - x) * np.log(y / x) for x, y in zip(data_psi, test_data_psi)] 113 | return {"PSI": sum(psi)} 114 | 
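For intuition, the PSI that PSI._inner_function computes above reduces to summing (test share - control share) * ln(test share / control share) over value buckets. Below is a minimal numpy sketch of the same idea on raw arrays. It deliberately sidesteps the Dataset API, uses linspace bucket edges instead of the class's arange-based ones, and clips shares to avoid log(0) on empty buckets, so treat it as an approximation rather than a drop-in equivalent:

import numpy as np

def psi(control: np.ndarray, test: np.ndarray, n_buckets: int = 10) -> float:
    # Bucket each sample over its own value range (NUM_OF_BUCKETS above is 10).
    c_edges = np.linspace(control.min(), control.max(), n_buckets + 1)
    t_edges = np.linspace(test.min(), test.max(), n_buckets + 1)
    # Share of observations per bucket; clip so empty buckets don't produce log(0).
    c_share = np.clip(np.histogram(control, bins=c_edges)[0] / len(control), 1e-6, None)
    t_share = np.clip(np.histogram(test, bins=t_edges)[0] / len(test), 1e-6, None)
    # PSI = sum over buckets of (test% - control%) * ln(test% / control%).
    return float(np.sum((t_share - c_share) * np.log(t_share / c_share)))

rng = np.random.default_rng(42)
print(psi(rng.normal(0, 1, 10_000), rng.normal(0.1, 1, 10_000)))  # small shift, low PSI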
-------------------------------------------------------------------------------- /hypex/ui/matching.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..analyzers.matching import MatchingAnalyzer 6 | from ..dataset import ( 7 | AdditionalMatchingRole, 8 | Dataset, 9 | ExperimentData, 10 | GroupingRole, 11 | StatisticRole, 12 | TargetRole, 13 | ) 14 | from ..reporters.matching import MatchingDictReporter, MatchingQualityDatasetReporter 15 | from ..utils import ID_SPLIT_SYMBOL, MATCHING_INDEXES_SPLITTER_SYMBOL 16 | from .base import Output 17 | 18 | 19 | class MatchingOutput(Output): 20 | resume: Dataset 21 | full_data: Dataset 22 | quality_results: Dataset 23 | 24 | def __init__(self, searching_class: type = MatchingAnalyzer): 25 | super().__init__( 26 | resume_reporter=MatchingDictReporter(searching_class), 27 | additional_reporters=MatchingQualityDatasetReporter(), 28 | ) 29 | 30 | def _extract_full_data(self, experiment_data: ExperimentData, indexes: Dataset): 31 | indexes.index = experiment_data.ds.index 32 | filtered_field = indexes.drop( 33 | indexes[indexes[indexes.columns[0]] == -1], axis=0 34 | ) 35 | matched_data = experiment_data.ds.loc[ 36 | list(map(lambda x: x[0], filtered_field.get_values())) 37 | ].rename({i: i + "_matched" for i in experiment_data.ds.columns}) 38 | matched_data.index = filtered_field.index 39 | self.indexes = indexes 40 | self.full_data = experiment_data.ds.append( 41 | matched_data.reindex(experiment_data.ds.index), axis=1 42 | ) 43 | 44 | def extract(self, experiment_data: ExperimentData): 45 | resume = self.resume_reporter.report(experiment_data) 46 | reformatted_resume: dict[str, Any] = {} 47 | for key, value in resume.items(): 48 | if ID_SPLIT_SYMBOL in key: 49 | keys = key.split(ID_SPLIT_SYMBOL) 50 | temp_key = keys[0] if len(keys) < 3 else f"{keys[2]} {keys[0]}" 51 | if temp_key not in reformatted_resume: 52 | reformatted_resume[temp_key] = {} 53 | reformatted_resume[temp_key].update({keys[1]: value}) 54 | if "indexes" in reformatted_resume.keys(): 55 | group_indexes_id = experiment_data.ds.search_columns(GroupingRole()) 56 | indexes = [ 57 | Dataset.from_dict( 58 | { 59 | "indexes": list( 60 | map(int, values.split(MATCHING_INDEXES_SPLITTER_SYMBOL)) 61 | ) 62 | }, 63 | index=experiment_data.ds[ 64 | experiment_data.ds[group_indexes_id] == group 65 | ].index, 66 | roles={"indexes": StatisticRole()}, 67 | ) 68 | for group, values in reformatted_resume.pop("indexes").items() 69 | ] 70 | indexes = indexes[0].append(indexes[1:]).sort() 71 | else: 72 | indexes = Dataset.from_dict( 73 | { 74 | "indexes": list( 75 | map( 76 | int, 77 | resume["indexes"].split(MATCHING_INDEXES_SPLITTER_SYMBOL), 78 | ) 79 | ) 80 | }, 81 | roles={"indexes": AdditionalMatchingRole()}, 82 | ) 83 | 84 | outcome = experiment_data.field_search(TargetRole())[0] 85 | reformatted_resume["outcome"] = { 86 | key: outcome 87 | for key in reformatted_resume[next(iter(reformatted_resume.keys()))].keys() 88 | } 89 | 90 | self.resume = Dataset.from_dict( 91 | reformatted_resume, 92 | roles={ 93 | column: StatisticRole() for column in list(reformatted_resume.keys()) 94 | }, 95 | ) 96 | self._extract_full_data( 97 | experiment_data, 98 | indexes, 99 | ) 100 | self.resume = round(self.resume, 2) 101 | 102 | self.quality_results = self.additional_reporters.report(experiment_data) 103 | -------------------------------------------------------------------------------- 
/hypex/dataset/roles.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | 5 | from ..utils import CategoricalTypes, DefaultRoleTypes, RoleNameType, TargetRoleTypes 6 | 7 | 8 | class ABCRole(ABC): 9 | _role_name: RoleNameType = "Abstract" 10 | 11 | def __init__(self, data_type: DefaultRoleTypes | None = None): 12 | self.data_type = data_type 13 | 14 | @property 15 | def role_name(self) -> str: 16 | return self._role_name 17 | 18 | def __repr__(self) -> str: 19 | return f"{self._role_name}({self.data_type})" 20 | 21 | 22 | class InfoRole(ABCRole): 23 | _role_name: RoleNameType = "Info" 24 | 25 | 26 | class StratificationRole(ABCRole): 27 | _role_name: RoleNameType = "Stratification" 28 | 29 | def __init__(self, data_type: CategoricalTypes | None = None): 30 | super().__init__(data_type) 31 | 32 | 33 | class GroupingRole(ABCRole): 34 | _role_name: RoleNameType = "Grouping" 35 | 36 | def __init__(self, data_type: CategoricalTypes | None = None): 37 | super().__init__(data_type) 38 | 39 | 40 | class TreatmentRole(ABCRole): 41 | _role_name: RoleNameType = "Treatment" 42 | 43 | 44 | class TargetRole(ABCRole): 45 | _role_name: RoleNameType = "Target" 46 | 47 | def __init__(self, data_type: TargetRoleTypes | None = None): 48 | super().__init__(data_type) 49 | 50 | 51 | class FeatureRole(ABCRole): 52 | _role_name: RoleNameType = "Feature" 53 | 54 | 55 | class PreTargetRole(ABCRole): 56 | _role_name: RoleNameType = "PreTarget" 57 | 58 | def __init__(self, data_type: TargetRoleTypes | None = None): 59 | super().__init__(data_type) 60 | 61 | 62 | class StatisticRole(ABCRole): 63 | _role_name: RoleNameType = "Statistic" 64 | 65 | 66 | class ResumeRole(ABCRole): 67 | _role_name = "Resume" 68 | 69 | 70 | class FilterRole(ABCRole): 71 | _role_name: RoleNameType = "Filter" 72 | 73 | 74 | class ConstGroupRole(ABCRole): 75 | _role_name: RoleNameType = "ConstGroup" 76 | 77 | 78 | # ___________________________________________________________________________________________ 79 | class TempRole(ABCRole): 80 | _role_name: RoleNameType = "Temp" 81 | 82 | 83 | class TempTreatmentRole(TempRole, TreatmentRole): 84 | _role_name: RoleNameType = "TempTreatment" 85 | 86 | 87 | class TempTargetRole(TempRole, TargetRole): 88 | _role_name: RoleNameType = "TempTarget" 89 | 90 | 91 | class TempGroupingRole(TempRole, GroupingRole): 92 | _role_name: RoleNameType = "TempGrouping" 93 | 94 | 95 | class DefaultRole(ABCRole): 96 | _role_name: RoleNameType = "Default" 97 | 98 | 99 | class ReportRole(ABCRole): 100 | _role_name: RoleNameType = "Report" 101 | 102 | 103 | # ___________________________________________________________________________________________ 104 | class AdditionalRole(ABCRole): 105 | _role_name: RoleNameType = "Additional" 106 | 107 | 108 | class AdditionalTreatmentRole(AdditionalRole): 109 | _role_name: RoleNameType = "AdditionalTreatment" 110 | 111 | 112 | class AdditionalGroupingRole(AdditionalRole): 113 | _role_name: RoleNameType = "AdditionalGrouping" 114 | 115 | 116 | class AdditionalTargetRole(AdditionalRole): 117 | _role_name: RoleNameType = "AdditionalTarget" 118 | 119 | 120 | class AdditionalPreTargetRole(AdditionalRole): 121 | _role_name: RoleNameType = "AdditionalPreTarget" 122 | 123 | 124 | class AdditionalMatchingRole(AdditionalRole): 125 | _role_name: RoleNameType = "AdditionalMatching" 126 | 127 | 128 | default_roles: dict[RoleNameType, ABCRole] = { 129 | "info": InfoRole(), 130 | "default": 
DefaultRole(), 131 | "feature": FeatureRole(), 132 | "treatment": TreatmentRole(), 133 | "grouping": GroupingRole(), 134 | "target": TargetRole(), 135 | "pretarget": PreTargetRole(), 136 | "stratification": StratificationRole(), 137 | "statistic": StatisticRole(), 138 | "filter": FilterRole(), 139 | "constgroup": ConstGroupRole(), 140 | "additionaltreatment": AdditionalTreatmentRole(), 141 | "additionalgrouping": AdditionalGroupingRole(), 142 | "additionaltarget": AdditionalTargetRole(), 143 | "additionalpretarget": AdditionalPreTargetRole(), 144 | } 145 | -------------------------------------------------------------------------------- /hypex/comparators/power_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | import numpy as np 7 | from scipy.stats import norm 8 | 9 | from ..dataset import ABCRole, Dataset, ExperimentData 10 | from ..utils import ExperimentDataEnum 11 | from .comparators import Comparator 12 | 13 | 14 | class PowerTesting(Comparator, ABC): 15 | def __init__( 16 | self, 17 | grouping_role: ABCRole | None = None, 18 | # space: SpaceEnum = SpaceEnum.auto, 19 | significance: float = 0.95, 20 | power: float = 0.8, 21 | key: Any = "", 22 | ): 23 | super().__init__( 24 | compare_by="groups", 25 | grouping_role=grouping_role, 26 | # space=space, 27 | key=key, 28 | ) 29 | self.significance = significance 30 | self.power = power 31 | 32 | @classmethod 33 | @abstractmethod 34 | def _inner_function( 35 | cls, 36 | data: Dataset, 37 | test_data: Dataset | None = None, 38 | significance: float = 0.95, 39 | power: float = 0.8, 40 | **kwargs, 41 | ) -> float: 42 | pass 43 | 44 | def execute(self, data: ExperimentData) -> ExperimentData: 45 | return super().execute(data) 46 | 47 | 48 | class MDEBySize(PowerTesting): 49 | def _set_value( 50 | self, data: ExperimentData, value: Dataset | None = None, key: Any = None 51 | ) -> ExperimentData: 52 | data.set_value( 53 | ExperimentDataEnum.variables, 54 | self.id, 55 | value, 56 | ) 57 | return data 58 | 59 | @classmethod 60 | def _inner_function( 61 | cls, 62 | data: Dataset, 63 | test_data: Dataset | None = None, 64 | significance: float = 0.95, 65 | power: float = 0.8, 66 | **kwargs, 67 | ) -> float: 68 | m = norm.ppf((1 + significance) / 2) + norm.ppf(power) 69 | if not test_data: 70 | raise ValueError("test_data is required") 71 | 72 | n_test, n_control = len(test_data), len(data) 73 | 74 | var_test, var_control = test_data.var(ddof=1), data.var(ddof=1) 75 | s = np.sqrt(var_test / n_test + var_control / n_control) 76 | 77 | return m * s 78 | 79 | 80 | # 81 | # 82 | # class StatPowerByTTestInd(TestPower): 83 | # 84 | # def _inner_function(self, control_data, test_data) -> ExperimentData: 85 | # control_size = len(control_data) 86 | # test_size = len(test_data) 87 | # 88 | # analysis = TTestIndPower() 89 | # ratio = test_size / control_size 90 | # return analysis.power( 91 | # effect_size=effect_size, 92 | # nobs1=test_size, 93 | # ratio=ratio, 94 | # alpha=significance, 95 | # 96 | 97 | 98 | # class MDEBySize(GroupComparator): 99 | # def __init__( 100 | # self, 101 | # grouping_role: Optional[ABCRole] = None, 102 | # space: SpaceEnum = SpaceEnum.auto, 103 | # full_name: Optional[str] = None, 104 | # key: Any = "", 105 | # power: float = 0.8, 106 | # significance: float = 0.95, 107 | # ): 108 | # super().__init__(grouping_role, space, full_name, key) 109 | # self.power = power 110 | # 
self.significance = significance 111 | # 112 | # @staticmethod 113 | # def _inner_function( 114 | # control_data, test_data, significance=0.95, power=0.8, **kwargs 115 | # ) -> Dict[str, Any]: 116 | # result = {} 117 | # m = norm.ppf(1 - significance / 2) - norm.ppf(power) 118 | # n_control, n_test = len(control_data), len(test_data) 119 | # proportion = n_test / (n_test + n_control) 120 | # p = np.sqrt(1 / (proportion * (1 - proportion))) 121 | # for target in control_data.columns: 122 | # var_control = control_data[target].var() 123 | # var_test = test_data[target].var() 124 | # s = np.sqrt(var_test / n_test + var_control / n_control) 125 | # result[target] = p * m * s 126 | # 127 | # return result 128 | # 129 | # @staticmethod 130 | # def calc( 131 | # cls: Dataset, 132 | # data: Union[Sequence[str], str, None], 133 | # group_field: Optional[str] = None, 134 | # grouping_data=None, 135 | # target_fields=None, 136 | # **kwargs 137 | # ): 138 | # return GroupComparator.calc( 139 | # data=data, 140 | # group_field=group_field, 141 | # target_fields=target_fields, 142 | # comparison_function=MDEBySize._inner_function, 143 | # power=power, 144 | # significance=target_fields, 145 | # ) 146 | # 147 | # def execute(self, data: ExperimentData) -> ExperimentData: 148 | # subdata = data.ds.loc[ 149 | # :, data.ds.get_columns_by_roles([TargetRole(), self.grouping_role]) 150 | # ] 151 | # ed = super().execute(ExperimentData(subdata)) 152 | # return self._set_value(data, ed.analysis_tables[self._id]) 153 | -------------------------------------------------------------------------------- /hypex/comparators/distances.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any, Sequence 5 | 6 | from ..dataset import ( 7 | ABCRole, 8 | Dataset, 9 | ExperimentData, 10 | FeatureRole, 11 | GroupingRole, 12 | TargetRole, 13 | ) 14 | from ..executor import Calculator 15 | from ..extensions.scipy_linalg import CholeskyExtension, InverseExtension 16 | from ..utils import ExperimentDataEnum, NotSuitableFieldError 17 | from ..utils.adapter import Adapter 18 | 19 | 20 | class MahalanobisDistance(Calculator): 21 | def __init__( 22 | self, 23 | grouping_role: ABCRole | None = None, 24 | key: Any = "", 25 | ): 26 | super().__init__(key=key) 27 | self.grouping_role = grouping_role or GroupingRole() 28 | 29 | @classmethod 30 | def _execute_inner_function( 31 | cls, 32 | grouping_data, 33 | target_fields: list[str] | None = None, 34 | **kwargs, 35 | ) -> dict: 36 | result = {} 37 | for i in range(1, len(grouping_data)): 38 | result.update( 39 | cls._inner_function( 40 | data=( 41 | grouping_data[0][1][target_fields] 42 | if target_fields 43 | else grouping_data[0][1] 44 | ), 45 | test_data=( 46 | grouping_data[i][1][target_fields] 47 | if target_fields 48 | else grouping_data[i][1] 49 | ), 50 | **kwargs, 51 | ) 52 | ) 53 | return result 54 | 55 | def _set_value( 56 | self, data: ExperimentData, value: dict | None = None, key: Any = None 57 | ) -> ExperimentData: 58 | for key, value_ in value.items(): 59 | data = data.set_value( 60 | ExperimentDataEnum.groups, 61 | self.id, 62 | value_, 63 | key=key, 64 | ) 65 | return data 66 | 67 | def _get_fields(self, data: ExperimentData): 68 | group_field = data.field_search(self.grouping_role) 69 | target_fields = data.field_search(FeatureRole(), search_types=self.search_types) 70 | return group_field, target_fields 71 | 72 | @property 73 | def 
search_types(self) -> list[type] | None: 74 | return [int, float] 75 | 76 | @classmethod 77 | def _inner_function(cls, data: Dataset, test_data: Dataset | None = None, **kwargs): 78 | test_data = cls._check_test_data(test_data) 79 | cov = (data.cov() + test_data.cov()) / 2 if test_data else data.cov() 80 | cholesky = CholeskyExtension().calc(cov) 81 | mahalanobis_transform = InverseExtension().calc(cholesky) 82 | y_control = data.dot(mahalanobis_transform.transpose()) 83 | if test_data: 84 | y_test = test_data.dot(mahalanobis_transform.transpose()) 85 | return {"control": y_control, "test": y_test} 86 | return {"control": y_control} 87 | 88 | @classmethod 89 | def calc( 90 | cls, 91 | data: Dataset, 92 | group_field: Sequence[str] | str | None = None, 93 | grouping_data: list[tuple[str, Dataset]] | None = None, 94 | target_fields: str | list[str] | None = None, 95 | **kwargs, 96 | ) -> dict: 97 | group_field = Adapter.to_list(group_field) 98 | 99 | if grouping_data is None: 100 | grouping_data = data.groupby(group_field) 101 | if len(grouping_data) > 1: 102 | grouping_data[0][1].tmp_roles = data.tmp_roles 103 | else: 104 | raise NotSuitableFieldError(group_field, "Grouping") 105 | return cls._execute_inner_function( 106 | grouping_data, target_fields=target_fields, old_data=data, **kwargs 107 | ) 108 | 109 | def execute(self, data: ExperimentData) -> ExperimentData: 110 | group_field, target_fields = self._get_fields(data=data) 111 | self.key = str( 112 | target_fields[0] if len(target_fields) == 1 else (target_fields or "") 113 | ) 114 | if ( 115 | not target_fields and data.ds.tmp_roles 116 | ): # if no column suits the test, target_fields is empty; when temporary roles are set this is expected behavior, so skip silently 117 | return data 118 | if group_field[0] in data.groups: # TODO: recheck whether this is the correct check 119 | grouping_data = list(data.groups[group_field[0]].items()) 120 | else: 121 | grouping_data = None 122 | t_data = deepcopy(data.ds) 123 | if target_fields[1] not in t_data.columns: 124 | t_data = t_data.add_column( 125 | data.additional_fields[target_fields[1]], 126 | role={target_fields[1]: TargetRole()}, 127 | ) 128 | compare_result = self.calc( 129 | data=t_data, 130 | group_field=group_field, 131 | target_fields=target_fields, 132 | grouping_data=grouping_data, 133 | ) 134 | return self._set_value(data, compare_result) 135 | -------------------------------------------------------------------------------- /hypex/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | from .analyzers.ab import ABAnalyzer 6 | from .comparators import Chi2Test, GroupDifference, GroupSizes, TTest, UTest 7 | from .dataset import TargetRole, TreatmentRole 8 | from .experiments.base import Experiment, OnRoleExperiment 9 | from .ui.ab import ABOutput 10 | from .ui.base import ExperimentShell 11 | from .utils import ABNTestMethodsEnum 12 | 13 | 14 | class ABTest(ExperimentShell): 15 | """A class for conducting A/B tests with configurable statistical tests and multiple testing correction. 16 | 17 | This class provides functionality to run A/B tests with options for different statistical tests 18 | (t-test, u-test, chi-square test) and multiple testing correction methods. 19 | 20 | Args: 21 | additional_tests (Union[str, List[str], None], optional): Statistical test(s) to run in addition to 22 | the default group difference calculation. 
Valid options are "t-test", "u-test", and "chi2-test". 23 | Can be a single test name or list of test names. Defaults to ["t-test"]. 24 | multitest_method (str, optional): Method to use for multiple testing correction. Valid options are: 25 | "bonferroni", "sidak", "holm-sidak", "holm", "simes-hochberg", "hommel", "fdr_bh", "fdr_by", 26 | "fdr_tsbh", "fdr_tsbhy", "quantile". Defaults to "holm". 27 | 28 | For more information refer to the statsmodels documentation: 29 | https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html 30 | 31 | Examples 32 | -------- 33 | .. code-block:: python 34 | 35 | # Basic A/B test with default t-test 36 | ab_test = ABTest() 37 | results = ab_test.execute(data) 38 | 39 | # A/B test with multiple statistical tests 40 | ab_test = ABTest( 41 | additional_tests=["t-test", "chi2-test"], 42 | multitest_method="bonferroni" 43 | ) 44 | results = ab_test.execute(data) 45 | """ 46 | 47 | @staticmethod 48 | def _make_experiment(additional_tests, multitest_method): 49 | """Creates an experiment configuration with specified statistical tests. 50 | 51 | Args: 52 | Args: 53 | additional_tests (Union[str, List[str], None], optional): Statistical test(s) to run in addition to 54 | the default group difference calculation. Valid options are "t-test", "u-test", and "chi2-test". 55 | Can be a single test name or list of test names. Defaults to ["t-test"]. 56 | multitest_method (str, optional): Method to use for multiple testing correction. Valid options are: 57 | "bonferroni", "sidak", "holm-sidak", "holm", "simes-hochberg", "hommel", "fdr_bh", "fdr_by", 58 | "fdr_tsbh", "fdr_tsbhy", "quantile". Defaults to "holm". 59 | For more information refer to the statsmodels documentation: 60 | 61 | 62 | Returns: 63 | Experiment: Configured experiment object with specified tests and correction method. 
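Examples
--------
.. code-block:: python

    # Illustrative sketch of the assembled pipeline: for
    # additional_tests=["t-test", "u-test"] and multitest_method="holm",
    # the returned Experiment runs GroupSizes, then an OnRoleExperiment
    # over TargetRole columns executing GroupDifference, TTest and UTest,
    # and finally an ABAnalyzer configured with the "holm" correction.
    experiment = ABTest._make_experiment(["t-test", "u-test"], "holm")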
64 | """ 65 | test_mapping = { 66 | "t-test": TTest(compare_by="groups", grouping_role=TreatmentRole()), 67 | "u-test": UTest(compare_by="groups", grouping_role=TreatmentRole()), 68 | "chi2-test": Chi2Test(compare_by="groups", grouping_role=TreatmentRole()), 69 | } 70 | on_role_executors = [GroupDifference(grouping_role=TreatmentRole())] 71 | additional_tests = ["t-test"] if additional_tests is None else additional_tests 72 | additional_tests = ( 73 | additional_tests 74 | if isinstance(additional_tests, list) 75 | else [additional_tests] 76 | ) 77 | for i in additional_tests: 78 | on_role_executors += [test_mapping[i]] 79 | return Experiment( 80 | executors=[ 81 | GroupSizes(grouping_role=TreatmentRole()), 82 | OnRoleExperiment( 83 | executors=on_role_executors, 84 | role=TargetRole(), 85 | ), 86 | ABAnalyzer( 87 | multitest_method=( 88 | ABNTestMethodsEnum(multitest_method) 89 | if multitest_method 90 | else None 91 | ) 92 | ), 93 | ] 94 | ) 95 | 96 | def __init__( 97 | self, 98 | additional_tests: ( 99 | Literal["t-test", "u-test", "chi2-test"] 100 | | list[Literal["t-test", "u-test", "chi2-test"]] 101 | | None 102 | ) = None, 103 | multitest_method: ( 104 | Literal[ 105 | "bonferroni", 106 | "sidak", 107 | "holm-sidak", 108 | "holm", 109 | "simes-hochberg", 110 | "hommel", 111 | "fdr_bh", 112 | "fdr_by", 113 | "fdr_tsbh", 114 | "fdr_tsbhy", 115 | "quantile", 116 | ] 117 | | None 118 | ) = "holm", 119 | t_test_equal_var: bool | None = None, 120 | ): 121 | super().__init__( 122 | experiment=self._make_experiment(additional_tests, multitest_method), 123 | output=ABOutput(), 124 | ) 125 | if t_test_equal_var is not None: 126 | self.experiment.set_params({TTest: {"calc_kwargs": {"equal_var": t_test_equal_var}}}) 127 | -------------------------------------------------------------------------------- /hypex/reporters/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | from ..dataset import Dataset, ExperimentData 7 | from ..dataset.roles import InfoRole, ReportRole, TreatmentRole 8 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum 9 | from ..utils.errors import AbstractMethodError 10 | 11 | 12 | class Reporter(ABC): 13 | @abstractmethod 14 | def report(self, data: ExperimentData): 15 | raise AbstractMethodError 16 | 17 | 18 | class DictReporter(Reporter, ABC): 19 | def __init__(self, front=True): 20 | self.front = front 21 | 22 | @staticmethod 23 | def extract_from_one_row_dataset(data: Dataset) -> dict[str, Any]: 24 | return {k: v[0] for k, v in data.to_dict()["data"]["data"].items()} 25 | 26 | def _extract_from_comparator(self, data: ExperimentData, comparator_id: str): 27 | result = {} 28 | field = comparator_id[comparator_id.rfind(ID_SPLIT_SYMBOL) + 1 :] 29 | executor_name = comparator_id[: comparator_id.find(ID_SPLIT_SYMBOL)] 30 | sep = " " if self.front else ID_SPLIT_SYMBOL 31 | analysis_dict = data.analysis_tables[comparator_id].to_dict()["data"] 32 | for i, index_value in enumerate(analysis_dict["index"]): 33 | for k, v in analysis_dict["data"].items(): 34 | key = sep.join( 35 | [field, executor_name, k, str(index_value)] 36 | if field 37 | else [executor_name, k, str(index_value)] 38 | ) 39 | result[key] = v[i] 40 | return result 41 | 42 | def _extract_from_comparators( 43 | self, data: ExperimentData, comparator_ids: list[str] 44 | ) -> dict[str, Any]: 45 | result = {} 46 | for comparator_id in comparator_ids: 47 | 
result.update(self._extract_from_comparator(data, comparator_id)) 48 | return result 49 | 50 | @abstractmethod 51 | def report(self, data: ExperimentData) -> dict: 52 | raise AbstractMethodError 53 | 54 | 55 | class OnDictReporter(Reporter, ABC): 56 | def __init__(self, dict_reporter: DictReporter) -> None: 57 | self.dict_reporter = dict_reporter 58 | 59 | 60 | class DatasetReporter(OnDictReporter): 61 | def report(self, data: ExperimentData) -> dict[str, Dataset] | Dataset: 62 | dict_result = self.dict_reporter.report(data) 63 | return self.convert_to_dataset( 64 | dict_result 65 | ) # TODO: change to DatasetAdapter.to_dataset() 66 | 67 | @staticmethod 68 | def convert_to_dataset(data: dict) -> dict[str, Dataset] | Dataset: 69 | return Dataset.from_dict(roles={k: ReportRole() for k in data}, data=[data]) 70 | 71 | 72 | class TestDictReporter(DictReporter): 73 | @staticmethod 74 | def _get_struct_dict(data: dict): 75 | dict_result = {} 76 | for key, value in data.items(): 77 | if ID_SPLIT_SYMBOL in key: 78 | key_split = key.split(ID_SPLIT_SYMBOL) 79 | if key_split[2] in ("pass", "p-value", "difference", "difference %", "control mean", "test mean"): 80 | if key_split[0] not in dict_result: 81 | dict_result[key_split[0]] = { 82 | key_split[3]: {key_split[1]: {key_split[2]: value}} 83 | } 84 | elif key_split[3] not in dict_result[key_split[0]]: 85 | dict_result[key_split[0]][key_split[3]] = { 86 | key_split[1]: {key_split[2]: value} 87 | } 88 | elif key_split[1] not in dict_result[key_split[0]][key_split[3]]: 89 | dict_result[key_split[0]][key_split[3]][key_split[1]] = { 90 | key_split[2]: value 91 | } 92 | else: 93 | dict_result[key_split[0]][key_split[3]][key_split[1]][ 94 | key_split[2] 95 | ] = value 96 | return dict_result 97 | 98 | @staticmethod 99 | def _convert_struct_dict_to_dataset(data: dict) -> Dataset: 100 | def rename_passed(data: dict[str, bool]): 101 | return { 102 | c: ( 103 | ("NOT OK" if (v is True or v == "True") else "OK") 104 | if "pass" in c 105 | else v 106 | ) 107 | for c, v in data.items() 108 | } 109 | 110 | result = [] 111 | for feature, groups in data.items(): 112 | for group, tests in groups.items(): 113 | t_values = {"feature": feature, "group": group} 114 | for test, values in tests.items(): 115 | if test == "GroupDifference": 116 | t_values["control mean"] = values.get("control mean") 117 | t_values["test mean"] = values.get("test mean") 118 | t_values["difference"] = values.get("difference") 119 | t_values["difference %"] = values.get("difference %") 120 | else: 121 | t_values[f"{test} pass"] = values.get("pass") 122 | t_values[f"{test} p-value"] = values.get("p-value") 123 | result.append(t_values) 124 | result = [rename_passed(d) for d in result] 125 | return Dataset.from_dict( 126 | result, 127 | roles={"feature": InfoRole(), "group": TreatmentRole()}, 128 | ) 129 | 130 | def extract_tests(self, data: ExperimentData) -> dict[str, Any]: 131 | test_ids = data.get_ids( 132 | self.tests, searched_space=ExperimentDataEnum.analysis_tables 133 | ) 134 | result = {} 135 | for class_, ids in test_ids.items(): 136 | result.update( 137 | self._extract_from_comparators( 138 | data, ids[ExperimentDataEnum.analysis_tables.value] 139 | ) 140 | ) 141 | return {k: v for k, v in result.items() if "pass" in k or "p-value" in k} 142 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import datetime 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | 18 | CURR_PATH = os.path.abspath(os.path.dirname(__file__)) 19 | LIB_PATH = os.path.join(CURR_PATH, os.path.pardir) 20 | sys.path.insert(0, LIB_PATH) 21 | 22 | project = "HypEx" 23 | copyright = f"{datetime.datetime.now().year}, AI Lab ML Tools" 24 | author = "AI Lab ML Tools" 25 | 26 | os.environ["DOCUMENTATION_ENV"] = "True" 27 | 28 | extensions = [ 29 | "sphinx.ext.autodoc", 30 | "sphinx.ext.autosummary", # will be used for tables 31 | "sphinx.ext.intersphinx", 32 | "sphinx.ext.napoleon", # structure 33 | "sphinx.ext.viewcode", # for [source] button 34 | "nbsphinx", 35 | "nbsphinx_link", 36 | "sphinx_autodoc_typehints", 37 | "IPython.sphinxext.ipython_console_highlighting", 38 | ] 39 | 40 | exclude_patterns = [ 41 | "_build/*", 42 | "**.ipynb_checkpoints", 43 | "Thumbs.db", 44 | ".DS_Store", 45 | ] 46 | 47 | # Delete external references 48 | autosummary_mock_imports = [ 49 | "numpy", 50 | "pandas", 51 | "scipy", 52 | "sklearn", 53 | "networkx", 54 | "holidays", 55 | "joblib", 56 | "yaml", 57 | "gensim", 58 | "PIL", 59 | "albumentations", 60 | "tqdm", 61 | "matplotlib", 62 | "seaborn", 63 | "json2html", 64 | "faiss", 65 | "statsmodels", 66 | ] 67 | 68 | # Add any paths that contain templates here, relative to this directory. 69 | templates_path = ["_templates"] 70 | 71 | # -- Options for HTML output ------------------------------------------------- 72 | 73 | # The theme to use for HTML and HTML Help pages. See the documentation for 74 | # a list of builtin themes. 75 | html_theme = "sphinx_rtd_theme" 76 | highlight_language = "python" 77 | 78 | html_theme_options = { 79 | 'logo_only': False, 80 | 'prev_next_buttons_location': 'bottom', 81 | 'style_external_links': True, 82 | 'vcs_pageview_mode': 'blob', 83 | 'style_nav_header_background': '#2980B9', 84 | # Toc options 85 | 'collapse_navigation': True, 86 | 'sticky_navigation': True, 87 | 'navigation_depth': 4, 88 | 'includehidden': True, 89 | 'titles_only': False, 90 | 'globaltoc_collapse': True, 91 | 'globaltoc_maxdepth': 3, 92 | } 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 
97 | html_static_path = ["_static"] 98 | 99 | html_css_files = [ 100 | 'custom.css', 101 | ] 102 | 103 | html_show_sourcelink = False 104 | html_sidebars = { 105 | '**': [ 106 | 'globaltoc.html', 107 | 'relations.html', 108 | 'sourcelink.html', 109 | 'searchbox.html', 110 | ] 111 | } 112 | 113 | # code style 114 | pygments_style = "sphinx" 115 | 116 | nbsphinx_execute = "never" 117 | 118 | # autodoc 119 | # function names that will not be included in documentation 120 | EXCLUDED_MEMBERS = ",".join( 121 | [ 122 | "get_own_record_history_wrapper", 123 | "get_record_history_wrapper", 124 | "record_history_omit", 125 | "record_history_only", 126 | ] 127 | ) 128 | 129 | autodoc_default_options = { 130 | "ignore-module-all": True, 131 | "show-inheritance": True, 132 | "exclude-members": EXCLUDED_MEMBERS, 133 | 'inherited-members': False, 134 | } 135 | 136 | # order of members in docs; useful for methods in a class 137 | autodoc_member_order = "bysource" 138 | 139 | # type hints in signatures ("none" hides them) 140 | autodoc_typehints = "none" 141 | 142 | # use only the class docstring, omitting __init__ docstrings 143 | autoclass_content = "class" 144 | 145 | # all warnings will be produced as errors 146 | autodoc_warningiserror = True 147 | 148 | # do not use parentheses when linking to a function 149 | add_function_parentheses = False 150 | 151 | # napoleon 152 | # Google docstring format is used in these docs 153 | napoleon_google_docstring = True 154 | napoleon_numpy_docstring = False 155 | 156 | napoleon_include_init_with_doc = True 157 | 158 | # to omit private members 159 | napoleon_include_private_with_doc = False 160 | 161 | # do not document special (dunder) members 162 | napoleon_include_special_with_doc = False 163 | 164 | napoleon_use_param = True 165 | 166 | # True to use a :keyword: role for each function keyword argument 167 | napoleon_use_keyword = True 168 | 169 | # True to use the .. admonition:: directive for Example/Examples sections instead of .. 
rubric:: 170 | napoleon_use_admonition_for_examples = True 171 | 172 | # Autosummary true if you want to generate it from very beginning 173 | autosummary_generate = True 174 | 175 | set_type_checking_flag = True 176 | 177 | always_document_param_types = False 178 | 179 | intersphinx_mapping = { 180 | "python": ("https://docs.python.org/3", None), 181 | "numpy": ("https://numpy.org/doc/stable", None), 182 | "scipy": ("https://docs.scipy.org/doc/scipy/", None), 183 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 184 | "sklearn": ("https://scikit-learn.org/stable/", None), 185 | "PIL": ("https://pillow.readthedocs.io/en/stable/", None), 186 | } 187 | 188 | 189 | # autodoc_type_aliases = { 190 | # "RoleType": "lightautoml.dataset.roles.ColumnRole", 191 | # "NpDataset": "lightautoml.text.utils.NpDataset", 192 | # } 193 | 194 | 195 | def skip_member(app, what, name, obj, skip, options): 196 | if obj.__doc__ is None: 197 | return True 198 | return None 199 | 200 | 201 | def setup(app): 202 | app.add_css_file("style.css") # customizing default theme 203 | app.connect("autodoc-skip-member", skip_member) 204 | -------------------------------------------------------------------------------- /hypex/analyzers/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any 5 | 6 | from ..comparators import TTest, UTest 7 | from ..dataset import Dataset, ExperimentData, StatisticRole, TargetRole, TreatmentRole 8 | from ..experiments.base import Executor 9 | from ..extensions.statsmodels import MultiTest, MultitestQuantile 10 | from ..utils import ( 11 | ID_SPLIT_SYMBOL, 12 | NAME_BORDER_SYMBOL, 13 | ABNTestMethodsEnum, 14 | BackendsEnum, 15 | ExperimentDataEnum, 16 | ) 17 | 18 | 19 | class ABAnalyzer(Executor): 20 | def __init__( 21 | self, 22 | multitest_method: ABNTestMethodsEnum | None = None, 23 | alpha: float = 0.05, 24 | equal_variance: bool = True, 25 | quantiles: float | list[float] | None = None, 26 | iteration_size: int = 20000, 27 | random_state: int | None = None, 28 | key: Any = "", 29 | ): 30 | self.multitest_method = multitest_method 31 | self.alpha = alpha 32 | self.equal_variance = equal_variance 33 | self.quantiles = quantiles 34 | self.iteration_size = iteration_size 35 | self.random_state = random_state 36 | super().__init__(key) 37 | 38 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 39 | return data.set_value( 40 | ExperimentDataEnum.analysis_tables, 41 | self.id + key if key else self.id, 42 | value, 43 | ) 44 | 45 | def execute_multitest(self, data: ExperimentData, p_values: Dataset, **kwargs): 46 | group_field = data.ds.search_columns(TreatmentRole())[0] 47 | target_fields = data.ds.search_columns(TargetRole(), search_types=[int, float]) 48 | if self.multitest_method and len(data.groups[group_field]) > 2: 49 | if self.multitest_method != ABNTestMethodsEnum.quantile: 50 | multitest_result = MultiTest(self.multitest_method).calc( 51 | p_values, **kwargs 52 | ) 53 | groups = [] 54 | for i in list(data.groups[group_field].keys())[1:]: 55 | groups += [i] * len(target_fields) 56 | multitest_result = multitest_result.add_column( 57 | groups 58 | * ( 59 | len(multitest_result) 60 | // len(target_fields) 61 | // (len(data.groups[group_field]) - 1) 62 | ), 63 | role={"group": StatisticRole()}, 64 | ) 65 | 66 | else: 67 | multitest_result = Dataset.create_empty() 68 | for target_field in target_fields: 69 | 
multitest_result = multitest_result.append( 70 | MultitestQuantile( 71 | self.alpha, 72 | self.iteration_size, 73 | self.equal_variance, 74 | self.random_state, 75 | ).calc( 76 | p_values, 77 | group_field=group_field, 78 | target_field=target_field, 79 | quantiles=self.quantiles, 80 | ) 81 | ) 82 | return self._set_value(data, multitest_result, key="MultiTest") 83 | return data 84 | 85 | def _add_pvalues(self, multitest_pvalues, value, field): 86 | if ( 87 | self.multitest_method 88 | and field == "p-value" 89 | and self.multitest_method != ABNTestMethodsEnum.quantile 90 | ): 91 | multitest_pvalues = multitest_pvalues.append(value) 92 | return multitest_pvalues 93 | 94 | def execute(self, data: ExperimentData) -> ExperimentData: 95 | executor_ids = data.get_ids([TTest, UTest]) 96 | num_groups = len(data.groups[data.ds.search_columns(TreatmentRole())[0]]) - 1 97 | groups = list(data.groups[data.ds.search_columns(TreatmentRole())[0]].items()) 98 | multitest_pvalues = Dataset.create_empty() 99 | analysis_data = {} 100 | for c, spaces in executor_ids.items(): 101 | analysis_ids = spaces.get("analysis_tables", []) 102 | if len(analysis_ids) == 0: 103 | continue 104 | t_data = deepcopy(data.analysis_tables[analysis_ids[0]]) 105 | for aid in analysis_ids[1:]: 106 | t_data = t_data.append(data.analysis_tables[aid]) 107 | if len(analysis_ids) < len(t_data): 108 | analysis_ids *= num_groups 109 | t_data.data.index = analysis_ids 110 | for f in ["p-value", "pass"]: 111 | for i in range(0, len(analysis_ids), len(analysis_ids) // num_groups): 112 | value = t_data.iloc[i : i + len(analysis_ids) // num_groups][f] 113 | multitest_pvalues = self._add_pvalues(multitest_pvalues, value, f) 114 | analysis_data[f"{c} {f} {groups[i // num_groups + 1][0]}"] = ( 115 | value.mean() 116 | ) 117 | if c not in ["UTest", "TTest"]: 118 | indexes = t_data.index 119 | values = t_data.data.values.tolist() 120 | for idx, value in zip(indexes, values): 121 | name = idx.split(ID_SPLIT_SYMBOL)[-1] 122 | analysis_data[ 123 | f"{c} {name[name.find(NAME_BORDER_SYMBOL) + 1 : name.rfind(NAME_BORDER_SYMBOL)]}" 124 | ] = value[0] 125 | 126 | analysis_dataset = Dataset.from_dict( 127 | [analysis_data], 128 | {f: StatisticRole() for f in analysis_data}, 129 | BackendsEnum.pandas, 130 | ) 131 | data = self.execute_multitest( 132 | data, 133 | ( 134 | multitest_pvalues 135 | if not multitest_pvalues.is_empty() 136 | and self.multitest_method != ABNTestMethodsEnum.quantile 137 | else data.ds 138 | ), 139 | ) 140 | 141 | return self._set_value(data, analysis_dataset) 142 | -------------------------------------------------------------------------------- /hypex/ml/faiss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Literal 4 | 5 | from ..comparators.distances import MahalanobisDistance 6 | from ..dataset import ( 7 | ABCRole, 8 | AdditionalMatchingRole, 9 | Dataset, 10 | ExperimentData, 11 | FeatureRole, 12 | ) 13 | from ..executor import MLExecutor 14 | from ..extensions.faiss import FaissExtension 15 | from ..utils import ExperimentDataEnum 16 | from ..utils.errors import PairsNotFoundError 17 | 18 | 19 | class FaissNearestNeighbors(MLExecutor): 20 | def __init__( 21 | self, 22 | n_neighbors: int = 1, 23 | two_sides: bool = False, 24 | test_pairs: bool = False, 25 | grouping_role: ABCRole | None = None, 26 | key: Any = "", 27 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 28 | ): 29 | self.n_neighbors = n_neighbors 30 | self.two_sides = 
two_sides 31 | self.test_pairs = test_pairs 32 | self.faiss_mode = faiss_mode 33 | super().__init__( 34 | grouping_role=grouping_role, target_role=FeatureRole(), key=key 35 | ) 36 | 37 | @classmethod 38 | def _execute_inner_function( 39 | cls, 40 | grouping_data, 41 | target_field: str | None = None, 42 | n_neighbors: int | None = None, 43 | two_sides: bool | None = None, 44 | test_pairs: bool | None = None, 45 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 46 | **kwargs, 47 | ) -> dict: 48 | if test_pairs is not True: 49 | data = cls._inner_function( 50 | data=grouping_data[0][1], 51 | test_data=grouping_data[1][1], 52 | n_neighbors=n_neighbors or 1, 53 | faiss_mode=faiss_mode, 54 | **kwargs, 55 | ) 56 | if two_sides is not True: 57 | return {"test": data} 58 | return { 59 | "test": data, 60 | "control": cls._inner_function( 61 | data=grouping_data[1][1], 62 | test_data=grouping_data[0][1], 63 | n_neighbors=n_neighbors or 1, 64 | faiss_mode=faiss_mode, 65 | **kwargs, 66 | ), 67 | } 68 | data = cls._inner_function( 69 | data=grouping_data[1][1], 70 | test_data=grouping_data[0][1], 71 | n_neighbors=n_neighbors or 1, 72 | faiss_mode=faiss_mode, 73 | **kwargs, 74 | ) 75 | if two_sides is not True: 76 | return {"control": data} 77 | return { 78 | "control": data, 79 | "test": cls._inner_function( 80 | data=grouping_data[0][1], 81 | test_data=grouping_data[1][1], 82 | n_neighbors=n_neighbors or 1, 83 | faiss_mode=faiss_mode, 84 | **kwargs, 85 | ), 86 | } 87 | 88 | @classmethod 89 | def _inner_function( 90 | cls, 91 | data: Dataset, 92 | test_data: Dataset | None = None, 93 | target_data: Dataset | None = None, 94 | n_neighbors: int | None = None, 95 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 96 | **kwargs, 97 | ) -> Any: 98 | return FaissExtension(n_neighbors=n_neighbors or 1, faiss_mode=faiss_mode).calc( 99 | data=data, test_data=test_data 100 | ) 101 | 102 | def fit(self, X: Dataset, Y: Dataset | None = None) -> MLExecutor: 103 | return FaissExtension(self.n_neighbors, self.faiss_mode).fit(X=X, Y=Y) 104 | 105 | def predict(self, X: Dataset) -> Dataset: 106 | return FaissExtension().predict(X) 107 | 108 | def execute(self, data: ExperimentData) -> ExperimentData: 109 | group_field, features_fields = self._get_fields(data=data) 110 | if group_field[0] in data.groups: 111 | grouping_data = list(data.groups[group_field[0]].items()) 112 | else: 113 | grouping_data = data.ds.groupby(group_field, fields_list=features_fields) 114 | distances_keys = data.get_ids(MahalanobisDistance, ExperimentDataEnum.groups) 115 | if len(distances_keys["MahalanobisDistance"]["groups"]) > 0: 116 | grouping_data = list( 117 | data.groups[distances_keys["MahalanobisDistance"]["groups"][0]].items() 118 | ) 119 | compare_result = self.calc( 120 | data=data.ds, 121 | group_field=group_field, 122 | grouping_data=grouping_data, 123 | features_fields=features_fields, 124 | n_neighbors=self.n_neighbors, 125 | faiss_mode=self.faiss_mode, 126 | two_sides=self.two_sides, 127 | test_pairs=self.test_pairs, 128 | ) 129 | ds = data.ds.groupby(group_field) 130 | matched_indexes = Dataset.create_empty() 131 | for i in range(len(compare_result.columns)): 132 | group = ( 133 | grouping_data[1][1] 134 | if compare_result.columns[i] == "test" 135 | else grouping_data[0][1] 136 | ) 137 | t_ds = ds[0][1] if compare_result.columns[i] == "test" else ds[1][1] 138 | t_index_field = ( 139 | compare_result[compare_result.columns[i]] 140 | .loc[: len(group) - 1] 141 | .rename({compare_result.columns[i]: "indexes"}) 142 | ) 143
| if t_index_field.isna().sum() > 0: 144 | raise PairsNotFoundError 145 | matched_indexes = matched_indexes.append( 146 | Dataset.from_dict( 147 | data={ 148 | "indexes": t_ds.iloc[ 149 | list(map(lambda x: int(x[0]), t_index_field.get_values())) 150 | ].index 151 | }, 152 | roles={"indexes": AdditionalMatchingRole()}, 153 | index=group.index, 154 | ) 155 | ).sort() 156 | if len(matched_indexes) < len(data.ds) and not self.two_sides: 157 | matched_indexes = matched_indexes.reindex(data.ds.index, fill_value=-1) 158 | elif len(matched_indexes) < len(data.ds) and self.two_sides: 159 | raise PairsNotFoundError 160 | return self._set_value(data, matched_indexes, key="matched") 161 | -------------------------------------------------------------------------------- /hypex/extensions/scipy_stats.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import Callable 5 | 6 | from scipy.stats import ( # type: ignore 7 | chi2_contingency, 8 | ks_2samp, 9 | mannwhitneyu, 10 | norm, 11 | ttest_ind, 12 | ) 13 | 14 | from ..dataset import Dataset, DatasetAdapter, StatisticRole 15 | from .abstract import CompareExtension 16 | 17 | 18 | class StatTest(CompareExtension): 19 | def __init__( 20 | self, test_function: Callable | None = None, reliability: float = 0.05 21 | ): 22 | super().__init__() 23 | self.test_function = test_function 24 | self.reliability = reliability 25 | 26 | @staticmethod # TODO: remove 27 | def check_other(other: Dataset | None) -> Dataset: 28 | if other is None: 29 | raise ValueError("No other dataset provided") 30 | return other 31 | 32 | @staticmethod 33 | def check_dataset(data: Dataset): 34 | if len(data.columns) != 1: 35 | raise ValueError("Data must be one-dimensional") 36 | 37 | def check_data(self, data: Dataset, other: Dataset | None) -> Dataset: 38 | other = self.check_other(other) 39 | 40 | self.check_dataset(data) 41 | self.check_dataset(other) 42 | 43 | return other 44 | 45 | def _calc_pandas( 46 | self, data: Dataset, other: Dataset | None = None, **kwargs 47 | ) -> Dataset | float: 48 | other = self.check_data(data, other) 49 | if self.test_function is None: 50 | raise ValueError("test_function is needed for execution") 51 | one_result = self.test_function( 52 | data.backend.data.values.flatten(), 53 | other.backend.data.values.flatten(), 54 | **kwargs, 55 | ) 56 | one_result = DatasetAdapter.to_dataset( 57 | { 58 | "p-value": one_result.pvalue, 59 | "statistic": one_result.statistic, 60 | "pass": one_result.pvalue < self.reliability, 61 | }, 62 | StatisticRole(), 63 | ) 64 | return one_result 65 | 66 | 67 | class TTestExtension(StatTest): 68 | def __init__(self, reliability: float = 0.05): 69 | super().__init__(ttest_ind, reliability=reliability) 70 | 71 | def _calc_pandas( 72 | self, data: Dataset, other: Dataset | None = None, **kwargs 73 | ) -> Dataset | float: 74 | # if ( 75 | # next(iter(data.nunique().values())) 76 | # and next(iter(other.nunique().values())) < 2 77 | # ): 78 | # return DatasetAdapter.to_dataset( 79 | # { 80 | # "p-value": [None], 81 | # "statistic": [None], 82 | # "pass": [None], 83 | # }, 84 | # StatisticRole(), 85 | # ) 86 | return super()._calc_pandas(data, other, nan_policy="omit", **kwargs) 87 | 88 | 89 | class KSTestExtension(StatTest): 90 | def __init__(self, reliability: float = 0.05): 91 | super().__init__(ks_2samp, reliability=reliability) 92 | 93 | 94 | class UTestExtension(StatTest): 95 | def __init__(self, reliability: float = 
0.05): 96 | super().__init__(mannwhitneyu, reliability=reliability) 97 | 98 | 99 | class Chi2TestExtension(StatTest): 100 | @staticmethod 101 | def mini_category_replace(counts: Dataset) -> Dataset: 102 | mini_counts = counts["count"][counts["count"] < 7] 103 | if len(mini_counts) > 0: 104 | counts = counts.append( 105 | Dataset.from_dict( 106 | [{counts.columns[0]: "other", "count": mini_counts["count"].sum()}], 107 | roles=mini_counts.roles, 108 | ) 109 | ) 110 | counts = counts[counts["count"] >= 7] 111 | return counts 112 | 113 | def matrix_preparation(self, data: Dataset, other: Dataset) -> Dataset | None: 114 | proportion = len(data) / (len(data) + len(other)) 115 | counted_data = data.value_counts() 116 | counted_data = self.mini_category_replace(counted_data) 117 | data_vc = counted_data["count"] * (1 - proportion) 118 | 119 | counted_other = other.value_counts() 120 | counted_other = self.mini_category_replace(counted_other) 121 | other_vc = counted_other["count"] * proportion 122 | 123 | if len(counted_data) < 2: 124 | return None 125 | data_vc = data_vc.add_column(counted_data[counted_data.columns[0]]) 126 | other_vc = other_vc.add_column(counted_other[counted_other.columns[0]]) 127 | return data_vc.merge(other_vc, on=counted_data.columns[0])[ 128 | ["count_x", "count_y"] 129 | ].fillna(0) 130 | 131 | def _calc_pandas( 132 | self, data: Dataset, other: Dataset | None = None, **kwargs 133 | ) -> Dataset | float: 134 | other = self.check_data(data, other) 135 | matrix = self.matrix_preparation(data, other) 136 | if matrix is None: 137 | warnings.warn(f"Matrix Chi2 is empty for {data.columns[0]}. Returning None") 138 | return DatasetAdapter.to_dataset( 139 | { 140 | "p-value": [None], 141 | "statistic": [None], 142 | "pass": [None], 143 | }, 144 | StatisticRole(), 145 | ) 146 | one_result = chi2_contingency(matrix.backend.data) 147 | return DatasetAdapter.to_dataset( 148 | { 149 | "p-value": ( 150 | one_result[1] 151 | if isinstance(one_result, tuple) 152 | else one_result.pvalue 153 | ), 154 | "statistic": ( 155 | one_result[0] 156 | if isinstance(one_result, tuple) 157 | else one_result.statistic 158 | ), 159 | "pass": ( 160 | one_result[1] 161 | if isinstance(one_result, tuple) 162 | else one_result.pvalue 163 | ) 164 | < self.reliability, 165 | }, 166 | StatisticRole(), 167 | ) 168 | 169 | 170 | class NormCDF(StatTest): 171 | def _calc_pandas( 172 | self, data: Dataset, other: Dataset | None = None, **kwargs 173 | ) -> Dataset | float: 174 | result = norm.cdf(abs(data.get_values()[0][0])) 175 | return DatasetAdapter.to_dataset( 176 | {"p-value": 2 * (1 - result)}, 177 | StatisticRole(), 178 | ) 179 | -------------------------------------------------------------------------------- /hypex/experiments/base_complex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from itertools import product 4 | from typing import Any, Sequence 5 | 6 | from tqdm import tqdm 7 | 8 | from ..dataset import ABCRole, Dataset, ExperimentData, GroupingRole 9 | from ..executor import Executor, IfExecutor 10 | from ..reporters import DatasetReporter, Reporter 11 | from ..utils.enums import ExperimentDataEnum 12 | from .base import Experiment 13 | 14 | 15 | class ExperimentWithReporter(Experiment): 16 | def __init__( 17 | self, 18 | executors: Sequence[Executor], 19 | reporter: Reporter, 20 | transformer: bool | None = None, 21 | key: str = "", 22 | ): 23 | super().__init__(executors, transformer, key) 24 | self.reporter = 
reporter 25 | 26 | def one_iteration( 27 | self, data: ExperimentData, key: str = "", set_key_as_index: bool = False 28 | ): 29 | t_data = ExperimentData(data.ds) 30 | self.key = key 31 | t_data = super().execute(t_data) 32 | result = self.reporter.report(t_data) 33 | if set_key_as_index: 34 | result.index = [key] 35 | return result 36 | 37 | def _set_result( 38 | self, data: ExperimentData, result: list[Dataset], reset_index: bool = True 39 | ): 40 | result = ( 41 | result[0].append(result[1:], reset_index=reset_index) 42 | if len(result) > 1 43 | else result[0] 44 | ) 45 | return self._set_value(data, result) 46 | 47 | 48 | class CycledExperiment(ExperimentWithReporter): 49 | def __init__( 50 | self, 51 | executors: list[Executor], 52 | reporter: DatasetReporter, 53 | n_iterations: int, 54 | transformer: bool | None = None, 55 | key: str = "", 56 | ): 57 | super().__init__(executors, reporter, transformer, key) 58 | self.n_iterations: int = n_iterations 59 | 60 | def generate_params_hash(self) -> str: 61 | return f"{self.reporter.__class__.__name__} x {self.n_iterations}" 62 | 63 | def execute(self, data: ExperimentData) -> ExperimentData: 64 | result: list[Dataset] = [ 65 | self.one_iteration(data, str(i)) for i in tqdm(range(self.n_iterations)) 66 | ] 67 | return self._set_result(data, result) 68 | 69 | 70 | class GroupExperiment(ExperimentWithReporter): 71 | def __init__( 72 | self, 73 | executors: Sequence[Executor], 74 | reporter: Reporter, 75 | searching_role: ABCRole = GroupingRole(), 76 | transformer: bool | None = None, 77 | key: str = "", 78 | ): 79 | self.searching_role = searching_role 80 | super().__init__(executors, reporter, transformer, key) 81 | 82 | def generate_params_hash(self) -> str: 83 | return f"GroupExperiment: {self.reporter.__class__.__name__}" 84 | 85 | def execute(self, data: ExperimentData) -> ExperimentData: 86 | group_field = data.ds.search_columns(self.searching_role) 87 | result: list[Dataset] = [ 88 | self.one_iteration( 89 | ExperimentData(group_data), str(group[0]), set_key_as_index=True 90 | ) 91 | for group, group_data in tqdm(data.ds.groupby(group_field)) 92 | ] 93 | return self._set_result(data, result, reset_index=False) 94 | 95 | 96 | class ParamsExperiment(ExperimentWithReporter): 97 | def __init__( 98 | self, 99 | executors: Sequence[Executor], 100 | reporter: DatasetReporter, 101 | params: dict[type, dict[str, Sequence[Any]]], 102 | transformer: bool | None = None, 103 | key: str = "", 104 | ): 105 | super().__init__(executors, reporter, transformer, key) 106 | self._params = params 107 | self._flat_params: list[dict[type, dict[str, Any]]] = [] 108 | 109 | def generate_params_hash(self) -> str: 110 | return f"ParamsExperiment: {self.reporter.__class__.__name__}" 111 | 112 | def _update_flat_params(self): 113 | classes = list(self._params) 114 | param_combinations = [ 115 | list( 116 | product( 117 | *[ 118 | product([parameter], values) 119 | for parameter, values in class_params.items() 120 | ] 121 | ) 122 | ) 123 | for class_params in self._params.values() 124 | ] 125 | new_flat_params = [ 126 | { 127 | classes[i]: dict(param_combination[i]) 128 | for i in range(len(param_combination)) 129 | } 130 | for param_combination in product(*param_combinations) 131 | ] 132 | self._flat_params = new_flat_params 133 | 134 | @property 135 | def flat_params(self) -> list[dict[type, dict[str, Any]]]: 136 | return self._flat_params 137 | 138 | @property 139 | def params(self) -> dict[type, dict[str, Sequence[Any]]]: 140 | return self._params 141 | 142 | 
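# Illustrative note (not part of the source): _update_flat_params expands the
# per-class grids into their Cartesian product. With hypothetical entries
#     params = {AASplitter: {"random_state": [0, 1]}, TTest: {"reliability": [0.05]}}
# it produces
#     flat_params = [
#         {AASplitter: {"random_state": 0}, TTest: {"reliability": 0.05}},
#         {AASplitter: {"random_state": 1}, TTest: {"reliability": 0.05}},
#     ]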
@params.setter 143 | def params(self, params: dict[type, dict[str, Sequence[Any]]]): 144 | self._params = params 145 | self._update_flat_params() 146 | 147 | def execute(self, data: ExperimentData) -> ExperimentData: 148 | results = [] 149 | self._update_flat_params() 150 | for flat_param in tqdm(self._flat_params): 151 | t_data = ExperimentData(data.ds) 152 | for executor in self.executors: 153 | executor.set_params(flat_param) 154 | t_data = executor.execute(t_data) 155 | report = self.reporter.report(t_data) 156 | results.append(report) 157 | return self._set_result(data, results) 158 | 159 | 160 | class IfParamsExperiment(ParamsExperiment): 161 | def __init__( 162 | self, 163 | executors: Sequence[Executor], 164 | reporter: DatasetReporter, 165 | params: dict[type, dict[str, Sequence[Any]]], 166 | stopping_criterion: IfExecutor, 167 | transformer: bool | None = None, 168 | key: str = "", 169 | ): 170 | self.stopping_criterion = stopping_criterion 171 | super().__init__(executors, reporter, params, transformer, key) 172 | 173 | def execute(self, data: ExperimentData) -> ExperimentData: 174 | self._update_flat_params() 175 | for flat_param in tqdm(self._flat_params): 176 | t_data = ExperimentData(data.ds) 177 | for executor in self.executors: 178 | executor.set_params(flat_param) 179 | t_data = executor.execute(t_data) 180 | if_result = self.stopping_criterion.execute(t_data) 181 | if_executor_id = if_result.get_one_id( 182 | self.stopping_criterion.__class__, ExperimentDataEnum.variables 183 | ) 184 | if if_result.variables[if_executor_id]["response"]: 185 | return self._set_result(data, [self.reporter.report(t_data)]) 186 | return data 187 | -------------------------------------------------------------------------------- /hypex/ui/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from ..experiments.base import Experiment 7 | from ..reporters import Reporter 8 | from ..utils import ID_SPLIT_SYMBOL 9 | from ..utils.enums import RenameEnum 10 | 11 | 12 | class Output: 13 | """A class for handling experiment output reporting and formatting. 14 | 15 | This class manages the reporting and formatting of experiment results, allowing for both 16 | a primary resume report and additional custom reports. 17 | 18 | Attributes: 19 | resume (Dataset): The main summary report of the experiment results. 20 | _experiment_data (ExperimentData): Internal storage of the experiment data. 21 | 22 | Args: 23 | resume_reporter (Reporter): The main reporter that generates the resume output. 24 | additional_reporters (Optional[Dict[str, Reporter]]): Dictionary mapping attribute 25 | names to additional reporters for custom reporting. Defaults to None. 26 | 27 | Examples 28 | -------- 29 | .. 
code-block:: python 30 | 31 | # Basic usage with just a resume reporter 32 | from my_reporters import MyResumeReporter 33 | output = Output(resume_reporter=MyResumeReporter()) 34 | output.extract(experiment_data) 35 | print(output.resume) 36 | 37 | # Using additional custom reporters 38 | from my_reporters import StatsReporter, PlotReporter 39 | additional = { 40 | 'statistics': StatsReporter(), 41 | 'plots': PlotReporter() 42 | } 43 | output = Output( 44 | resume_reporter=MyResumeReporter(), 45 | additional_reporters=additional 46 | ) 47 | output.extract(experiment_data) 48 | print(output.statistics) # Access additional report 49 | print(output.plots) # Access additional report 50 | """ 51 | 52 | resume: Dataset 53 | _experiment_data: ExperimentData 54 | 55 | def __init__( 56 | self, 57 | resume_reporter: Reporter, 58 | additional_reporters: dict[str, Reporter] | None = None, 59 | ): 60 | self.resume_reporter = resume_reporter 61 | self.additional_reporters = additional_reporters or {} 62 | 63 | def _extract_by_reporters(self, experiment_data: ExperimentData): 64 | """Extracts reports from all configured reporters. 65 | 66 | Args: 67 | experiment_data (ExperimentData): The experiment data to generate reports from. 68 | """ 69 | self.resume = self.resume_reporter.report(experiment_data) 70 | for attribute, reporter in self.additional_reporters.items(): 71 | setattr(self, attribute, reporter.report(experiment_data)) 72 | self._experiment_data = experiment_data 73 | 74 | @staticmethod 75 | def _replace_splitters( 76 | data: Dataset, mode: RenameEnum = RenameEnum.columns 77 | ) -> Dataset: 78 | result = data 79 | if mode in (RenameEnum.all, RenameEnum.columns): 80 | result = result.rename( 81 | {c: c.replace(ID_SPLIT_SYMBOL, " ") for c in result.columns} 82 | ) 83 | if mode in (RenameEnum.all, RenameEnum.index): 84 | result.index = [i.replace(ID_SPLIT_SYMBOL, " ") for i in result.index] 85 | return result 86 | 87 | def extract(self, experiment_data: ExperimentData): 88 | """Extracts and processes all reports from the experiment data. 89 | 90 | Args: 91 | experiment_data (ExperimentData): The experiment data to generate reports from. 92 | 93 | Examples 94 | -------- 95 | .. code-block:: python 96 | 97 | output = Output(resume_reporter=MyReporter()) 98 | output.extract(experiment_data) 99 | print(output.resume) # Access the main report 100 | """ 101 | self._extract_by_reporters(experiment_data) 102 | 103 | 104 | class ExperimentShell: 105 | """Base class for experiment execution with configurable output handling. 106 | 107 | This class provides a shell for executing experiments with customizable parameters 108 | and output formatting. It serves as a base class for specific experiment types 109 | like A/B tests and A/A tests. 110 | 111 | Args: 112 | experiment (Experiment): The experiment configuration to execute. 113 | output (Output): Output handler that defines how results are formatted. 114 | experiment_params (Optional[Dict[str, Any]], optional): Additional parameters 115 | to configure the experiment. Defaults to None. 116 | 117 | Examples 118 | -------- 119 | .. 
code-block:: python 120 | 121 | # Basic usage with default parameters 122 | experiment = Experiment([...]) # Configure experiment 123 | output = Output(resume_reporter=MyReporter()) 124 | shell = ExperimentShell(experiment, output) 125 | results = shell.execute(data) 126 | 127 | # With custom experiment parameters 128 | params = { 129 | "random_state": 42, 130 | "test_size": 0.3 131 | } 132 | shell = ExperimentShell( 133 | experiment=experiment, 134 | output=output, 135 | experiment_params=params 136 | ) 137 | results = shell.execute(data) 138 | """ 139 | 140 | def __init__( 141 | self, 142 | experiment: Experiment, 143 | output: Output, 144 | experiment_params: dict[str, Any] | None = None, 145 | ): 146 | if experiment_params: 147 | experiment.set_params(experiment_params) 148 | self._out = output 149 | self._experiment = experiment 150 | 151 | @property 152 | def experiment(self): 153 | """Gets the configured experiment instance. 154 | 155 | Returns: 156 | Experiment: The experiment configuration object. 157 | """ 158 | return self._experiment 159 | 160 | def execute(self, data: Dataset | ExperimentData) -> Output: 161 | """Executes the experiment on the provided data. 162 | 163 | Runs the configured experiment on the input data and formats the results 164 | using the configured output handler. 165 | 166 | Args: 167 | data (Union[Dataset, ExperimentData]): Input data for the experiment. 168 | Can be either a Dataset or ExperimentData instance. 169 | 170 | Returns: 171 | Output: Formatted experiment results through the configured output handler. 172 | 173 | Examples 174 | -------- 175 | .. code-block:: python 176 | 177 | shell = ExperimentShell(experiment, output) 178 | dataset = Dataset(...) # Your input data 179 | results = shell.execute(dataset) 180 | print(results.resume) # Access formatted results 181 | """ 182 | if isinstance(data, Dataset): 183 | data = ExperimentData(data) 184 | result_experiment_data = self._experiment.execute(data) 185 | self._out.extract(result_experiment_data) 186 | return self._out 187 | -------------------------------------------------------------------------------- /hypex/splitters/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from ..dataset import ( 9 | AdditionalTreatmentRole, 10 | Dataset, 11 | ExperimentData, 12 | StratificationRole, 13 | TreatmentRole, 14 | ) 15 | from ..dataset.roles import ConstGroupRole 16 | from ..executor import Calculator 17 | from ..utils import ExperimentDataEnum 18 | 19 | 20 | class AASplitter(Calculator): 21 | def __init__( 22 | self, 23 | control_size: float = 0.5, 24 | random_state: int | None = None, 25 | sample_size: float | None = None, 26 | constant_key: bool = True, 27 | save_groups: bool = True, 28 | key: Any = "", 29 | ): 30 | self.control_size = control_size 31 | self.random_state = random_state 32 | self._key = key 33 | self.constant_key = constant_key 34 | self.save_groups = save_groups 35 | self.sample_size = sample_size 36 | super().__init__(key) 37 | 38 | def _generate_params_hash(self): 39 | hash_parts: list[str] = [] 40 | if self.control_size != 0.5: 41 | hash_parts.append(f"cs {self.control_size}") 42 | if self.random_state is not None: 43 | hash_parts.append(f"rs {self.random_state}") 44 | self._params_hash = "|".join(hash_parts) 45 | 46 | def init_from_hash(self, params_hash: str): 47 | hash_parts: list[str] = params_hash.split("|") 48 | 
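# Illustrative: a hash produced by _generate_params_hash, e.g. "cs 0.3|rs 42",
# restores control_size=0.3 and random_state=42 in the loop below.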
for hash_part in hash_parts: 49 | if hash_part.startswith("cs"): 50 | self.control_size = float(hash_part[hash_part.rfind(" ") + 1 :]) 51 | elif hash_part.startswith("rs"): 52 | self.random_state = int(hash_part[hash_part.rfind(" ") + 1 :]) 53 | self._generate_id() 54 | 55 | @property 56 | def key(self) -> Any: 57 | return self._key 58 | 59 | @key.setter 60 | def key(self, value: Any): 61 | if not self.constant_key: 62 | self._key = value 63 | self._generate_id() 64 | 65 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 66 | data = data.set_value( 67 | ExperimentDataEnum.additional_fields, 68 | self._id, 69 | value, 70 | role=AdditionalTreatmentRole(), 71 | ) 72 | 73 | if self.save_groups: 74 | data.groups[self.id] = { 75 | group: data.ds.loc[group_data.index] 76 | for group, group_data in data.additional_fields.groupby(self.id) 77 | } 78 | return data 79 | 80 | @staticmethod 81 | def _inner_function( 82 | data: Dataset, 83 | random_state: int | None = None, 84 | control_size: float = 0.5, 85 | sample_size: float | None = None, 86 | const_group_field: str | None = None, 87 | **kwargs, 88 | ) -> list[str]: 89 | sample_size = 1.0 if sample_size is None else sample_size 90 | control_indexes = [] 91 | if const_group_field: 92 | const_data = dict(data.groupby(const_group_field)) 93 | control_data = const_data.get("control") 94 | if control_data is not None: 95 | control_indexes = list(control_data.index) 96 | const_size = sum(len(cd) for cd in const_data.values()) 97 | control_size = (len(data) * control_size - const_size) / ( 98 | len(data) - const_size 99 | ) 100 | experiment_data = ( 101 | data[data[const_group_field].isna()] if const_group_field else data 102 | ) 103 | experiment_data_index = experiment_data.sample( 104 | frac=sample_size, random_state=random_state 105 | ).index 106 | addition_indexes = list(experiment_data_index) 107 | edge = int(len(addition_indexes) * control_size) 108 | control_indexes += addition_indexes[:edge] 109 | 110 | split_series = pd.Series(np.ones(data.data.shape[0], dtype="int"), index=data.data.index) 111 | split_series[control_indexes] -= 1 112 | split_series = split_series.map({0: "control", 1: "test"}) 113 | 114 | return split_series.to_list() 115 | 116 | def execute(self, data: ExperimentData) -> ExperimentData: 117 | const_group_fields = data.ds.search_columns(ConstGroupRole()) 118 | const_group_fields = ( 119 | const_group_fields[0] if len(const_group_fields) > 0 else None 120 | ) 121 | result = self.calc( 122 | data.ds, 123 | random_state=self.random_state, 124 | control_size=self.control_size, 125 | sample_size=self.sample_size, 126 | const_group_field=const_group_fields, 127 | ) 128 | return self._set_value( 129 | data, 130 | result, 131 | ) 132 | 133 | 134 | class AASplitterWithStratification(AASplitter): 135 | @staticmethod 136 | def _inner_function( 137 | data: Dataset, 138 | random_state: int | None = None, 139 | control_size: float = 0.5, 140 | grouping_fields=None, 141 | **kwargs, 142 | ) -> list[str] | Dataset: 143 | if not grouping_fields: 144 | return AASplitter._inner_function( 145 | data, random_state, control_size, **kwargs 146 | ) 147 | result = {"split": []} 148 | index = [] 149 | for group, group_data in data.groupby(grouping_fields): 150 | result["split"].extend( 151 | AASplitter._inner_function(group_data, random_state, control_size) 152 | ) 153 | index.extend(list(group_data.index)) 154 | return Dataset.from_dict(result, index=index, roles={"split": TreatmentRole()}) 155 | 156 | def execute(self, data: 
ExperimentData) -> ExperimentData: 157 | grouping_fields = data.ds.search_columns(StratificationRole()) 158 | result = self.calc( 159 | data.ds, 160 | random_state=self.random_state, 161 | control_size=self.control_size, 162 | grouping_fields=grouping_fields, 163 | ) 164 | if isinstance(result, Dataset): 165 | result = result.replace_roles({"split": AdditionalTreatmentRole()}) 166 | return self._set_value(data, result) 167 | 168 | 169 | # 170 | # class AASplitterWithStratification(AASplitter): 171 | # def __init__( 172 | # self, 173 | # control_size: float = 0.5, 174 | # random_state: Optional[int] = None, 175 | # # key: Any = "", 176 | # ): 177 | # super().__init__(control_size, random_state, key) 178 | # 179 | # def calc(self, data: Dataset): 180 | # stratification_columns = data.get_columns_by_roles(StratificationRole()) 181 | # 182 | # groups = data.groupby(stratification_columns) 183 | # result = Dataset._create_empty() 184 | # for _, gd in groups: 185 | # ged = ExperimentData(gd) 186 | # ged = super().execute(ged) 187 | # 188 | # result = ged if result is None else result.append(ged) 189 | # return result["group"] 190 | 191 | 192 | # As idea 193 | # class SplitterAAMulti(ExperimentMulti): 194 | # def execute(self, data): 195 | # raise NotImplementedError 196 | -------------------------------------------------------------------------------- /hypex/reporters/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | from typing import Any, ClassVar 5 | 6 | from ..comparators import Chi2Test, GroupDifference, GroupSizes, KSTest, TTest 7 | from ..dataset import Dataset, ExperimentData, InfoRole, StatisticRole 8 | from ..splitters import AASplitter, AASplitterWithStratification 9 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum, NotFoundInExperimentDataError 10 | from .abstract import Reporter, TestDictReporter 11 | 12 | 13 | class OneAADictReporter(TestDictReporter): 14 | tests: ClassVar[list] = [TTest, KSTest, Chi2Test] 15 | 16 | @staticmethod 17 | def convert_flat_dataset(data: dict) -> Dataset: 18 | struct_dict = OneAADictReporter._get_struct_dict(data) 19 | return OneAADictReporter._convert_struct_dict_to_dataset(struct_dict) 20 | 21 | @staticmethod 22 | def get_splitter_id(data: ExperimentData): 23 | for c in [AASplitter, AASplitterWithStratification]: 24 | with contextlib.suppress(NotFoundInExperimentDataError): 25 | return data.get_one_id(c, ExperimentDataEnum.additional_fields) 26 | 27 | def extract_group_difference(self, data: ExperimentData) -> dict[str, Any]: 28 | group_difference_ids = data.get_ids(GroupDifference)[GroupDifference.__name__][ 29 | ExperimentDataEnum.analysis_tables.value 30 | ] 31 | return self._extract_from_comparators(data, group_difference_ids) 32 | 33 | def extract_group_sizes(self, data: ExperimentData) -> dict[str, Any]: 34 | group_sizes_id = data.get_one_id(GroupSizes, ExperimentDataEnum.analysis_tables) 35 | return self._extract_from_comparators(data, [group_sizes_id]) 36 | 37 | def extract_analyzer_data(self, data: ExperimentData) -> dict[str, Any]: 38 | analyzer_id = data.get_one_id( 39 | "OneAAStatAnalyzer", ExperimentDataEnum.analysis_tables 40 | ) 41 | return self.extract_from_one_row_dataset(data.analysis_tables[analyzer_id]) 42 | 43 | def extract_data_from_analysis_tables(self, data: ExperimentData) -> dict[str, Any]: 44 | result = {} 45 | result.update(self.extract_group_difference(data)) 46 | # 
result.update(self.extract_group_sizes(data)) 47 | result.update(self.extract_tests(data)) 48 | result.update(self.extract_analyzer_data(data)) 49 | if self.front: 50 | result = self.rename_passed(result) 51 | return result 52 | 53 | def report(self, data: ExperimentData) -> dict[str, Any]: 54 | result = { 55 | "splitter_id": self.get_splitter_id(data), 56 | } 57 | result.update(self.extract_data_from_analysis_tables(data)) 58 | return result 59 | 60 | 61 | class AADatasetReporter(OneAADictReporter): 62 | def report(self, data: ExperimentData): 63 | front_buffer = self.front 64 | self.front = False 65 | dict_report = super().report(data) 66 | self.front = front_buffer 67 | return self.convert_flat_dataset(dict_report) 68 | 69 | 70 | class AAPassedReporter(Reporter): 71 | @staticmethod 72 | def _reformat_aa_score_table(table: Dataset) -> Dataset: 73 | result = {} 74 | for ind in table.index: 75 | splitted_index = ind.split(ID_SPLIT_SYMBOL) 76 | row_index = f"{splitted_index[0]}{ID_SPLIT_SYMBOL}{splitted_index[-1]}" 77 | value = table.get_values(ind, "pass") 78 | if row_index not in result: 79 | result[row_index] = {splitted_index[1]: value} 80 | else: 81 | result[row_index][splitted_index[1]] = value 82 | result = Dataset.from_dict(result, roles={}).transpose() * 1 83 | return result 84 | 85 | @staticmethod 86 | def _reformat_best_split_table(table: Dataset) -> Dataset: 87 | passed = table.loc[:, [c for c in table.columns if (c.endswith("pass"))]] 88 | new_index = table.apply( 89 | lambda x: f"{x['feature']}{ID_SPLIT_SYMBOL}{x['group']}", 90 | {"index": InfoRole()}, 91 | axis=1, 92 | ) 93 | passed.index = new_index.get_values(column="index") 94 | passed = passed.rename( 95 | names={c: c[: c.rfind("pass") - 1] for c in passed.columns} 96 | ) 97 | passed = passed.replace("OK", 1).replace("NOT OK", 0) 98 | passed = passed.astype({c: int for c in passed.columns}, errors="ignore") 99 | return passed 100 | 101 | def _detect_pass(self, analyzer_tables: dict[str, Dataset]): 102 | score_table = self._reformat_aa_score_table(analyzer_tables["aa score"]) 103 | best_split_table = self._reformat_best_split_table( 104 | analyzer_tables["best split statistics"] 105 | ) 106 | resume_table = score_table * best_split_table 107 | resume_table = resume_table.apply( 108 | lambda x: "OK" if x.sum() > 0 else "NOT OK", 109 | axis=1, 110 | role={"result": StatisticRole()}, 111 | ) 112 | result = score_table.merge( 113 | best_split_table, 114 | suffixes=(" aa test", " best split"), 115 | left_index=True, 116 | right_index=True, 117 | ) 118 | result = result.merge(resume_table, left_index=True, right_index=True) 119 | result.roles = {c: r.__class__(str) for c, r in result.roles.items()} 120 | result = ( 121 | result.replace(0, "NOT OK") 122 | .replace(1, "OK") 123 | .replace("0", "NOT OK") 124 | .replace("1", "OK") 125 | ) 126 | splitted_index = [str(i).split(ID_SPLIT_SYMBOL) for i in result.index] 127 | result.add_column([i[0] for i in splitted_index], role={"feature": InfoRole()}) 128 | result.add_column([i[1] for i in splitted_index], role={"group": InfoRole()}) 129 | result.index = range(len(splitted_index)) 130 | return result 131 | 132 | def report(self, data: ExperimentData) -> Dataset: 133 | analyser_ids = data.get_ids( 134 | "AAScoreAnalyzer", ExperimentDataEnum.analysis_tables 135 | ) 136 | analyser_tables = { 137 | id_[id_.rfind(ID_SPLIT_SYMBOL) + 1 :]: data.analysis_tables[id_] 138 | for id_ in analyser_ids["AAScoreAnalyzer"][ 139 | ExperimentDataEnum.analysis_tables.value 140 | ] 141 | } 142 | if not 
analyser_tables["aa score"]: 143 | print("AA test cannot be performed as none of the analyzers passed") 144 | return None 145 | result = self._detect_pass(analyser_tables) 146 | stats_cols = ["feature", "group", "control mean", "test mean", "difference", "difference %"] 147 | differences = analyser_tables["best split statistics"].loc[ 148 | :, 149 | [ 150 | col 151 | for col in stats_cols 152 | if col in analyser_tables["best split statistics"].columns 153 | ], 154 | ] 155 | result = result.merge(differences, on=["feature", "group"], how="left") 156 | result = result[ 157 | ["feature", "group"] 158 | + [c for c in result.columns if c not in ["feature", "group"]] 159 | ] 160 | numeric_cols = ["control mean", "test mean", "difference", "difference %"] 161 | for col in numeric_cols: 162 | result.data[col] = result.data[col].astype(float).round(6) 163 | return result 164 | 165 | 166 | class AABestSplitReporter(Reporter): 167 | def report(self, data: ExperimentData): 168 | best_split_id = next( 169 | (c for c in data.additional_fields.columns if c.endswith("best")), [] 170 | ) 171 | markers = data.additional_fields.loc[:, best_split_id] 172 | markers = markers.rename({markers.columns[0]: "split"}) 173 | return data.ds.merge(markers, left_index=True, right_index=True) 174 | -------------------------------------------------------------------------------- /tests/test_tutorials.py: -------------------------------------------------------------------------------- 1 | # starts with HYPEX-dir: PYTHONPATH=$(pwd) pytest 2 | import random 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pandas.testing as pdt 7 | import pytest 8 | 9 | from hypex import AATest, ABTest, Matching 10 | from hypex.dataset import ( 11 | Dataset, 12 | FeatureRole, 13 | InfoRole, 14 | StratificationRole, 15 | TargetRole, 16 | TreatmentRole, 17 | ) 18 | 19 | 20 | @pytest.fixture 21 | def aa_data(): 22 | return [ 23 | Dataset( 24 | roles={ 25 | "user_id": InfoRole(int), 26 | "treat": TreatmentRole(int), 27 | "pre_spends": TargetRole(), 28 | "post_spends": TargetRole(), 29 | "gender": StratificationRole(str), 30 | }, 31 | data="examples/tutorials/data.csv", 32 | ), 33 | Dataset( 34 | roles={ 35 | "user_id": InfoRole(int), 36 | "treat": TreatmentRole(int), 37 | "pre_spends": TargetRole(), 38 | "post_spends": TargetRole(), 39 | "gender": TargetRole(str), 40 | }, 41 | data="examples/tutorials/data.csv", 42 | ), 43 | ] 44 | 45 | 46 | @pytest.fixture 47 | def ab_data(): 48 | random.seed(7) 49 | data = Dataset( 50 | roles={ 51 | "user_id": InfoRole(int), 52 | "treat": TreatmentRole(), 53 | "pre_spends": TargetRole(), 54 | "post_spends": TargetRole(), 55 | "gender": TargetRole(), 56 | }, 57 | data="examples/tutorials/data.csv", 58 | ) 59 | data["treat"] = [random.choice([0, 1, 2]) for _ in range(len(data))] 60 | return data 61 | 62 | 63 | @pytest.fixture 64 | def matching_data(): 65 | data = Dataset( 66 | roles={ 67 | "user_id": InfoRole(int), 68 | "treat": TreatmentRole(int), 69 | "post_spends": TargetRole(float), 70 | }, 71 | data="examples/tutorials/data.csv", 72 | default_role=FeatureRole(), 73 | ) 74 | data = data.fillna(method="bfill") 75 | return data 76 | 77 | 78 | def test_aatest(aa_data): 79 | mapping = { 80 | "aa-casual": AATest(n_iterations=10), 81 | "aa-rs": AATest(random_states=[56, 72, 2, 43]), 82 | "aa-strat": AATest(random_states=[56, 72, 2, 43], stratification=True), 83 | "aa-sample": AATest(n_iterations=10, sample_size=0.3), 84 | "aa-cat_target": AATest(n_iterations=10), 85 | "aa-equal_var": 
AATest(n_iterations=10, t_test_equal_var=False), 86 | } 87 | 88 | mapping_resume = { 89 | "aa-casual": pd.DataFrame( 90 | { 91 | "TTest aa test": {0: "OK", 1: "OK"}, 92 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 93 | "TTest best split": {0: "OK", 1: "OK"}, 94 | "KSTest best split": {0: "OK", 1: "OK"}, 95 | "result": {0: "OK", 1: "OK"}, 96 | } 97 | ), 98 | "aa-rs": pd.DataFrame( 99 | { 100 | "TTest aa test": {0: "OK", 1: "OK"}, 101 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 102 | "TTest best split": {0: "OK", 1: "OK"}, 103 | "KSTest best split": {0: "OK", 1: "OK"}, 104 | "result": {0: "OK", 1: "OK"}, 105 | } 106 | ), 107 | "aa-strat": pd.DataFrame( 108 | { 109 | "TTest aa test": {0: "OK", 1: "NOT OK"}, 110 | "KSTest aa test": {0: "OK", 1: "NOT OK"}, 111 | "TTest best split": {0: "OK", 1: "OK"}, 112 | "KSTest best split": {0: "OK", 1: "OK"}, 113 | "result": {0: "OK", 1: "NOT OK"}, 114 | } 115 | ), 116 | "aa-sample": pd.DataFrame( 117 | { 118 | "TTest aa test": {0: "OK", 1: "OK"}, 119 | "KSTest aa test": {0: "OK", 1: "OK"}, 120 | "TTest best split": {0: "NOT OK", 1: "NOT OK"}, 121 | "KSTest best split": {0: "OK", 1: "OK"}, 122 | "result": {0: "OK", 1: "OK"}, 123 | } 124 | ), 125 | "aa-cat_target": pd.DataFrame( 126 | { 127 | "TTest aa test": ["OK", "OK", np.nan], 128 | "KSTest aa test": ["NOT OK", "OK", np.nan], 129 | "Chi2Test aa test": [np.nan, np.nan, "OK"], 130 | "TTest best split": ["OK", "OK", np.nan], 131 | "KSTest best split": ["OK", "OK", np.nan], 132 | "Chi2Test best split": [np.nan, np.nan, "OK"], 133 | "result": ["OK", "OK", "OK"], 134 | } 135 | ), 136 | "aa-equal_var": pd.DataFrame( 137 | { 138 | "TTest aa test": {0: "OK", 1: "OK"}, 139 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 140 | "TTest best split": {0: "OK", 1: "OK"}, 141 | "KSTest best split": {0: "OK", 1: "OK"}, 142 | "result": {0: "OK", 1: "OK"}, 143 | } 144 | ), 145 | } 146 | 147 | for test_name in mapping.keys(): 148 | print(test_name) 149 | if test_name == "aa-cat_target": 150 | res = mapping[test_name].execute(aa_data[1]) 151 | else: 152 | res = mapping[test_name].execute(aa_data[0]) 153 | actual_data = res.resume.data.iloc[:, 2:-4] 154 | expected_data = mapping_resume[test_name] 155 | pdt.assert_frame_equal(expected_data, actual_data, check_dtype=False) 156 | 157 | 158 | def test_abtest(ab_data): 159 | mapping = { 160 | "ab-casual": ABTest(), 161 | "ab-additional": ABTest(additional_tests=["t-test", "u-test", "chi2-test"]), 162 | "ab-n": ABTest(multitest_method="bonferroni"), 163 | } 164 | 165 | mapping_resume = { 166 | "ab-casual": pd.DataFrame( 167 | {"TTest pass": {0: "NOT OK", 1: "NOT OK", 2: "NOT OK", 3: "NOT OK"}} 168 | ), 169 | "ab-additional": pd.DataFrame( 170 | { 171 | "TTest pass": { 172 | 0: "NOT OK", 173 | 1: "NOT OK", 174 | 2: "NOT OK", 175 | 3: "NOT OK", 176 | 4: 0, 177 | 5: 0, 178 | }, 179 | "UTest pass": { 180 | 0: "NOT OK", 181 | 1: "NOT OK", 182 | 2: "NOT OK", 183 | 3: "NOT OK", 184 | 4: 0, 185 | 5: 0, 186 | }, 187 | "Chi2Test pass": {0: 0, 1: 0, 2: 0, 3: 0, 4: "NOT OK", 5: "NOT OK"}, 188 | } 189 | ), 190 | "ab-n": pd.DataFrame( 191 | {"TTest pass": {0: "NOT OK", 1: "NOT OK", 2: "NOT OK", 3: "NOT OK"}} 192 | ), 193 | } 194 | 195 | for test_name in mapping.keys(): 196 | res = mapping[test_name].execute(ab_data) 197 | actual_data = ( 198 | res.resume.data.fillna(0) 199 | .apply(pd.to_numeric, errors="ignore") 200 | .iloc[:, 6::2] 201 | ) 202 | expected_data = mapping_resume[test_name] 203 | pdt.assert_frame_equal(expected_data, actual_data, check_dtype=False) 204 | 205 | 206 | def 
test_matchingtest(matching_data):
207 |     mapping = {
208 |         "matching": Matching(),
209 |         "matching-atc": Matching(metric="atc"),
210 |         "matching-att": Matching(metric="att"),
211 |         "matching-l2": Matching(distance="l2", metric="att"),
212 |         "matching-faiss-auto": Matching(distance="l2", faiss_mode="auto"),
213 |         "matching-faiss_base": Matching(distance="mahalanobis", faiss_mode="base"),
214 |         "matching-n-neighbors": Matching(n_neighbors=2),
215 |     }
216 | 
217 |     for test_name in mapping.keys():
218 |         res = mapping[test_name].execute(matching_data)
219 |         actual_data = res.resume.data
220 |         assert actual_data.index.isin(["ATT", "ATC", "ATE"]).all()
221 |         assert all(
222 |             actual_data.iloc[:, :-1].dtypes.apply(
223 |                 lambda x: pd.api.types.is_numeric_dtype(x)
224 |             )
225 |         ), "There are non-numeric columns!"
226 | 
--------------------------------------------------------------------------------
/hypex/extensions/statsmodels.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import numpy as np
4 | from scipy.stats import norm  # type: ignore
5 | from statsmodels.stats.multitest import multipletests  # type: ignore
6 | 
7 | from ..dataset import Dataset, DatasetAdapter, StatisticRole
8 | from ..utils import ID_SPLIT_SYMBOL, ABNTestMethodsEnum
9 | from .abstract import Extension
10 | 
11 | 
12 | class MultiTest(Extension):
13 |     def __init__(self, method: ABNTestMethodsEnum, alpha: float = 0.05):
14 |         self.method = method
15 |         self.alpha = alpha
16 |         super().__init__()
17 | 
18 |     def _calc_pandas(self, data: Dataset, **kwargs):
19 |         p_values = data.data.values.flatten()
20 |         new_pvalues = multipletests(
21 |             p_values, method=self.method.value, alpha=self.alpha, **kwargs
22 |         )
23 |         return DatasetAdapter.to_dataset(
24 |             {
25 |                 "field": [i.split(ID_SPLIT_SYMBOL)[2] for i in data.index],
26 |                 "test": [i.split(ID_SPLIT_SYMBOL)[0] for i in data.index],
27 |                 "old p-value": p_values,
28 |                 "new p-value": new_pvalues[1],
29 |                 "correction": [
30 |                     j / i if j != 0 else 0.0 for i, j in zip(new_pvalues[1], p_values)
31 |                 ],
32 |                 "rejected": new_pvalues[0],
33 |             },
34 |             StatisticRole(),
35 |         )
36 | 
37 | 
38 | class MultitestQuantile(Extension):
39 |     def __init__(
40 |         self,
41 |         alpha: float = 0.05,
42 |         iteration_size: int = 20000,
43 |         equal_variance: bool = True,
44 |         random_state: int | None = None,
45 |     ):
46 |         self.alpha = alpha
47 |         self.iteration_size = iteration_size
48 |         self.equal_variance = equal_variance
49 |         self.random_state = random_state
50 |         super().__init__()
51 | 
52 |     def _calc_pandas(self, data: Dataset, **kwargs):
53 |         group_field = kwargs.get("group_field")
54 |         target_field = kwargs.get("target_field")
55 |         quantiles = kwargs.get("quantiles")
56 |         num_samples = len(data.unique()[group_field])
57 |         sample_size = len(data)
58 |         grouped_data = data.groupby(by=group_field, fields_list=target_field)
59 |         means = [sample[1].agg("mean") for sample in grouped_data]
60 |         variances = [
61 |             sample[1].agg("var") * sample_size / (sample_size - 1)
62 |             for sample in grouped_data
63 |         ]
64 |         if num_samples != len(means) or num_samples != len(variances):
65 |             num_samples = min(num_samples, len(means), len(variances))
66 |         if isinstance(quantiles, float):
67 |             quantiles = np.full(num_samples, quantiles).tolist()
68 | 
69 |         quantiles = quantiles or self.quantile_of_marginal_distribution(
70 |             num_samples=num_samples,
71 |             quantile_level=1 - self.alpha / num_samples,
72 |             variances=variances,
73 |         )
74 |         for j in range(num_samples):
75 |             min_t_value = np.inf
76 |             for i in range(num_samples):
77 |                 if i != j:
78 |                     t_value = (
79 |                         np.sqrt(sample_size)
80 |                         * (means[j] - means[i])
81 |                         / np.sqrt(variances[j] + variances[i])
82 |                     )
83 |                     min_t_value = min(min_t_value, t_value)
84 |             if min_t_value > quantiles[j]:
85 |                 return DatasetAdapter.to_dataset(
86 |                     {"field": target_field, "accepted hypothesis": j + 1},
87 |                     StatisticRole(),
88 |                 )
89 |         return DatasetAdapter.to_dataset(
90 |             {"field": target_field, "accepted hypothesis": 0}, StatisticRole()
91 |         )
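# --- Editor's note: hedged usage sketch, not part of the library source. -----
# quantile_of_marginal_distribution (next method) estimates, by Monte Carlo,
# the requested quantile of min_{i != j} T_ij -- the smallest standardized
# difference between sample j and every other sample under H0. A minimal call,
# assuming only the constructor shown above (all argument values illustrative):
#
#   mt = MultitestQuantile(alpha=0.05, iteration_size=20000, random_state=7)
#   thresholds = mt.quantile_of_marginal_distribution(
#       num_samples=3, quantile_level=1 - 0.05 / 3
#   )
#   # -> a list of 3 thresholds; all equal when equal_variance=True (the default)
# ------------------------------------------------------------------------------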
92 | 
93 |     def quantile_of_marginal_distribution(
94 |         self,
95 |         num_samples: int,
96 |         quantile_level: float,
97 |         variances: list[float] | None = None,
98 |     ) -> list[float]:
99 |         if variances is None:
100 |             self.equal_variance = True
101 |         num_samples_hyp = 1 if self.equal_variance else num_samples
102 |         quantiles = []
103 |         for j in range(num_samples_hyp):
104 |             t_values = []
105 |             random_samples = norm.rvs(
106 |                 size=[self.iteration_size, num_samples], random_state=self.random_state
107 |             )
108 |             for sample in random_samples:
109 |                 min_t_value = np.inf
110 |                 for i in range(num_samples):
111 |                     if i != j:
112 |                         if self.equal_variance:
113 |                             t_value = (sample[j] - sample[i]) / np.sqrt(2)
114 |                         else:
115 |                             if variances is None:
116 |                                 raise ValueError("variances is needed for execution")
117 |                             t_value = sample[j] / np.sqrt(
118 |                                 1 + variances[i] / variances[j]
119 |                             ) - sample[i] / np.sqrt(1 + variances[j] / variances[i])
120 |                         min_t_value = min(min_t_value, t_value)
121 |                 t_values.append(min_t_value)
122 |             quantiles.append(np.quantile(t_values, quantile_level))
123 |         return (
124 |             np.full(num_samples, quantiles[0]).tolist()
125 |             if self.equal_variance
126 |             else quantiles
127 |         )
128 | 
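# --- Editor's note: hedged usage sketch, not part of the library source. -----
# min_sample_size (next method) grows the per-group size in steps of 100 until
# the simulated share of rejections reaches 1 - power. A minimal call for the
# unequal-variance branch, assuming the signature below (values illustrative):
#
#   mt = MultitestQuantile(alpha=0.05, equal_variance=False, random_state=7)
#   result = mt.min_sample_size(num_samples=3, mde=0.1, variances=[1.0, 1.2, 0.9])
#   # -> {"min sample size": <int>} on this branch; the equal-variance branch
#   #    returns a bare int instead.
# ------------------------------------------------------------------------------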
129 |     def min_sample_size(
130 |         self,
131 |         num_samples: int,
132 |         mde: float,
133 |         variances: list[float] | float,
134 |         power: float = 0.2,
135 |         quantile_1: float | list[float] | None = None,
136 |         quantile_2: float | list[float] | None = None,
137 |         initial_estimate: int = 0,
138 |         iteration_size: int = 3000,
139 |     ):
140 |         if isinstance(quantile_1, float):
141 |             quantile_1 = np.full(num_samples, quantile_1).tolist()
142 |         if isinstance(quantile_2, float):
143 |             quantile_2 = np.full(num_samples, quantile_2).tolist()
144 | 
145 |         quantile_1 = quantile_1 or self.quantile_of_marginal_distribution(
146 |             num_samples=num_samples,
147 |             quantile_level=1 - self.alpha / num_samples,
148 |             variances=variances if isinstance(variances, list) else [variances],
149 |         )
150 |         quantile_2 = quantile_2 or self.quantile_of_marginal_distribution(
151 |             num_samples=num_samples, quantile_level=power
152 |         )
153 | 
154 |         if self.equal_variance:
155 |             return int(2 * variances * ((quantile_1[0] - quantile_2[0]) / mde) ** 2) + 1
156 |         else:
157 |             sizes = []
158 |             for index in range(num_samples):
159 |                 size = initial_estimate
160 |                 current_power = 0
161 |                 while current_power < 1 - power:
162 |                     size += 100
163 |                     current_power = 0
164 |                     total_samples = norm.rvs(
165 |                         size=[iteration_size, num_samples],
166 |                         random_state=self.random_state,
167 |                     )
168 |                     for sample in total_samples:
169 |                         min_t_value = np.inf
170 |                         for i in range(num_samples):
171 |                             if i != index:
172 |                                 t_value = (
173 |                                     sample[index]
174 |                                     / np.sqrt(1 + variances[i] / variances[index])
175 |                                     - sample[i]
176 |                                     / np.sqrt(1 + variances[index] / variances[i])
177 |                                     + mde
178 |                                     * np.sqrt(size / (variances[index] + variances[i]))
179 |                                 )
180 |                                 min_t_value = min(min_t_value, t_value)
181 |                         if min_t_value > quantile_1[index]:
182 |                             current_power += 1
183 |                     current_power /= iteration_size
184 |                 sizes.append(size)
185 |             return {"min sample size": np.max(sizes)}
186 | 
--------------------------------------------------------------------------------
/hypex/dataset/abstract.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import copy
4 | import json  # type: ignore
5 | from abc import ABC
6 | from typing import Any, Iterable
7 | 
8 | import pandas as pd  # type: ignore
9 | 
10 | from ..utils import BackendsEnum, RoleColumnError
11 | from .backends import PandasDataset
12 | from .roles import ABCRole, DefaultRole, default_roles
13 | 
14 | 
15 | def parse_roles(roles: dict) -> dict[str | int, ABCRole]:
16 |     new_roles = {}
17 |     roles = roles or {}
18 |     for role in roles:
19 |         r = default_roles.get(role, role)
20 |         if isinstance(roles[role], list):
21 |             for i in roles[role]:
22 |                 new_roles[i] = copy.deepcopy(r)
23 |         else:
24 |             new_roles[roles[role]] = copy.deepcopy(r)
25 |     return new_roles or roles
26 | 
27 | 
28 | class DatasetBase(ABC):
29 |     @staticmethod
30 |     def _select_backend_from_data(data):
31 |         return PandasDataset(data)
32 | 
33 |     @staticmethod
34 |     def _select_backend_from_str(data, backend):
35 |         if backend == BackendsEnum.pandas:
36 |             return PandasDataset(data)
37 |         if backend is None:
38 |             return PandasDataset(data)
39 |         raise TypeError("Backend must be an instance of BackendsEnum")
40 | 
41 |     def _set_all_roles(self, roles):
42 |         keys = list(roles.keys())
43 |         for column in self.columns:
44 |             if column not in keys:
45 |                 roles[column] = copy.deepcopy(self.default_role) or DefaultRole()
46 |         return roles
47 | 
48 |     def _set_empty_types(self, roles):
49 |         for column, role in roles.items():
50 |             if role.data_type is None:
51 |                 role.data_type = self._backend.get_column_type(column)
52 |             self._backend = self._backend.update_column_type(column, role.data_type)
53 | 
54 |     def __init__(
55 |         self,
56 |         roles: dict[ABCRole, list[str] | str] | dict[str, ABCRole],
57 |         data: pd.DataFrame | str | None = None,
58 |         backend: BackendsEnum | None = None,
59 |         default_role: ABCRole | None = None,
60 |     ):
61 |         self._backend = (
62 |             self._select_backend_from_str(data, backend)
63 |             if backend
64 |             else self._select_backend_from_data(data)
65 |         )
66 |         self.default_role = default_role
67 |         roles = (
68 |             parse_roles(roles)
69 |             if any(isinstance(role, ABCRole) for role in roles.keys())
70 |             else roles
71 |         )
72 |         if any(not isinstance(role, ABCRole) for role in roles.values()):
73 |             raise TypeError("Roles must be instances of ABCRole type")
74 |         if data is not None and any(
75 |             i not in self._backend.columns for i in list(roles.keys())
76 |         ):
77 |             raise RoleColumnError(list(roles.keys()), self._backend.columns)
78 |         if data is not None:
79 |             roles = self._set_all_roles(roles)
80 |             self._set_empty_types(roles)
81 |         self._roles: dict[str, ABCRole] = roles
82 |         self._tmp_roles: (
83 |             dict[ABCRole, list[str] | str] | dict[list[str] | str, ABCRole]
84 |         ) = {}
85 | 
86 |     def __repr__(self):
87 |         return self.data.__repr__()
88 | 
89 |     def _repr_html_(self):
90 |         return self.data._repr_html_()
91 | 
92 |     def __len__(self):
93 |         return self._backend.__len__()
94 | 
95 |     def search_columns(
96 |         self,
97 |         roles: ABCRole | Iterable[ABCRole],
98 |         tmp_role=False,
99 |         search_types: list | None = None,
100 |     ) -> list[str]:
101 |         roles = roles if isinstance(roles, Iterable) else [roles]
102 |         roles_for_search = self._tmp_roles if tmp_role else self.roles
103 |         return [
104 |             str(column)
105 |             for column, role in roles_for_search.items()
106 |             if any(
107 |                 isinstance(r, role.__class__)
108 |                 and (not search_types or role.data_type in search_types)
109 |                 for r in roles
110 |             )
111 |         ]
112 | 
113 |     def replace_roles(
114 |         self,
115 |         new_roles_map: dict[ABCRole | str, ABCRole],
116 |         tmp_role: bool = False,
117 |         auto_roles_types: bool = False,
118 |     ):
119 |         new_roles_map = parse_roles(
120 |             {
121 |                 role: (
122 |                     self.search_columns(column, tmp_role)
123 |                     if isinstance(column, ABCRole)
124 |                     else column
125 |                 )
126 |                 for column, role in new_roles_map.items()
127 |             }
128 |         )
129 | 
130 |         new_roles = {
131 |             column: new_roles_map[column] if column in new_roles_map else role
132 |             for column, role in self.roles.items()
133 |         }
134 | 
135 |         if tmp_role:
136 |             self._tmp_roles = new_roles
137 |         else:
138 |             self.roles = new_roles
139 |         if auto_roles_types:
140 |             self._set_empty_types(new_roles_map)
141 | 
142 |         return self
143 | 
144 |     @property
145 |     def index(self):
146 |         return self._backend.index
147 | 
148 |     @property
149 |     def data(self):
150 |         return self._backend.data
151 | 
152 |     @property
153 |     def roles(self):
154 |         return self._roles
155 | 
156 |     @roles.setter
157 |     def roles(self, value):
158 |         self._set_roles(new_roles_map=value, temp_role=False)
159 | 
160 |     @data.setter
161 |     def data(self, value):
162 |         self._backend.data = value
163 | 
164 |     @property
165 |     def columns(self):
166 |         return self._backend.columns
167 | 
168 |     @property
169 |     def shape(self):
170 |         return self._backend.shape
171 | 
172 |     @property
173 |     def tmp_roles(self):
174 |         return self._tmp_roles
175 | 
176 |     @tmp_roles.setter
177 |     def tmp_roles(self, value):
178 |         self._set_roles(new_roles_map=value, temp_role=True)
179 |         self._set_empty_types(self._tmp_roles)
180 | 
181 |     def to_dict(self):
182 |         return {
183 |             "backend": self._backend.name,
184 |             "roles": {
185 |                 "role_names": list(self.roles.keys()),
186 |                 "columns": list(self.roles.values()),
187 |             },
188 |             "data": self._backend.to_dict(),
189 |         }
190 | 
191 |     def to_records(self):
192 |         return self._backend.to_records()
193 | 
194 |     def to_json(self, filename: str | None = None):
195 |         if not filename:
196 |             return json.dumps(self.to_dict())
197 |         with open(filename, "w") as file:
198 |             json.dump(self.to_dict(), file)
199 | 
200 |     @property
201 |     def backend(self):
202 |         return self._backend
203 | 
204 |     def get_values(
205 |         self,
206 |         row: str | None = None,
207 |         column: str | None = None,
208 |     ) -> Any:
209 |         return self._backend.get_values(row=row, column=column)
210 | 
211 |     def iget_values(
212 |         self,
213 |         row: int | None = None,
214 |         column: int | None = None,
215 |     ) -> Any:
216 |         return self._backend.iget_values(row=row, column=column)
217 | 
218 |     def _set_roles(
219 |         self,
220 |         new_roles_map: dict[ABCRole, list[str] | str] | dict[list[str] | str, ABCRole],
221 |         temp_role: bool = False,
222 |     ):
223 |         if not new_roles_map:
224 |             if not temp_role:
225 |                 return self.roles
226 |             else:
227 |                 self._tmp_roles = {}
228 |             return self
229 | 
230 |         keys, values = list(new_roles_map.keys()), list(new_roles_map.values())
231 |         roles, columns_sets = (
232 |             (keys, values) if isinstance(keys[0], ABCRole) else (values, keys)
233 |         )
234 | 
235 |         new_roles = {}
236 |         for role, columns in zip(roles, columns_sets):
237 |             if isinstance(columns, list):
238 |                 for column in columns:
239 |                     new_roles[column] = copy.deepcopy(role)
240 |             else:
241 |                 new_roles[columns] = copy.deepcopy(role)
242 | 
243 |         if temp_role:
244 |             self._tmp_roles = new_roles
245 |         else:
246 |             self._roles = new_roles
247 | 
248 |         return self
249 | 
--------------------------------------------------------------------------------
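Editor's note: what follows is a hedged usage sketch of the role-mapping API from /hypex/dataset/abstract.py above; it is not part of the repository. It assumes only names shown in this dump — the Dataset and role classes re-exported by hypex.dataset, imported the same way as in tests/test_tutorials.py — and the DataFrame contents and column names are purely illustrative.

    import pandas as pd

    from hypex.dataset import Dataset, InfoRole, TargetRole, TreatmentRole

    # Illustrative frame; DatasetBase accepts a pandas DataFrame or a CSV path.
    df = pd.DataFrame(
        {"user_id": [1, 2, 3], "treat": [0, 1, 0], "post_spends": [10.0, 12.5, 9.8]}
    )

    # Roles can be given as a column -> role mapping (role -> columns also works,
    # per the __init__ annotation above).
    ds = Dataset(
        roles={
            "user_id": InfoRole(int),
            "treat": TreatmentRole(int),
            "post_spends": TargetRole(float),
        },
        data=df,
    )

    # search_columns resolves role instances back to column names ...
    print(ds.search_columns(TargetRole()))  # expected: ['post_spends']

    # ... and replace_roles rewrites the column -> role mapping in place.
    ds.replace_roles({"post_spends": InfoRole(float)})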