├── hypex ├── forks │ ├── __init__.py │ └── aa.py ├── ui │ ├── __init__.py │ ├── homo.py │ ├── aa.py │ ├── ab.py │ ├── matching.py │ └── base.py ├── analyzers │ ├── __init__.py │ ├── matching.py │ └── ab.py ├── encoders │ ├── __init__.py │ ├── encoders.py │ └── abstract.py ├── factory │ ├── __init__.py │ └── base.py ├── __version__.py ├── operators │ ├── __init__.py │ └── abstract.py ├── hypotheses │ ├── __init__.py │ └── hypothesis.py ├── ml │ ├── __init__.py │ └── faiss.py ├── dataset │ ├── backends │ │ └── __init__.py │ ├── __init__.py │ ├── roles.py │ └── abstract.py ├── splitters │ ├── __init__.py │ └── aa.py ├── executor │ └── __init__.py ├── utils │ ├── constants.py │ ├── adapter.py │ ├── enums.py │ ├── typings.py │ ├── __init__.py │ ├── errors.py │ └── decorator.py ├── experiments │ ├── __init__.py │ ├── base.py │ └── base_complex.py ├── __init__.py ├── reporters │ ├── __init__.py │ ├── homo.py │ ├── ab.py │ ├── matching.py │ ├── abstract.py │ └── aa.py ├── transformers │ ├── __init__.py │ ├── abstract.py │ ├── shuffle.py │ ├── category_agg.py │ └── na_filler.py ├── comparators │ ├── __init__.py │ ├── hypothesis_testing.py │ ├── comparators.py │ ├── power_testing.py │ └── distances.py ├── extensions │ ├── __init__.py │ ├── scipy_linalg.py │ ├── encoders.py │ ├── abstract.py │ ├── faiss.py │ ├── scipy_stats.py │ └── statsmodels.py ├── preprocessing.py ├── homogeneity.py └── ab.py ├── .flake8 ├── docs ├── requirements.txt ├── _templates │ ├── functiontemplate.rst │ ├── classtemplate.rst │ └── autosummary │ │ ├── class.rst │ │ └── module.rst ├── _static │ ├── style.css │ └── custom.css ├── api_reference.rst ├── index.rst ├── Makefile ├── installation.rst ├── mock_docs.py ├── quickstart.rst └── conf.py ├── coverage.sh ├── examples └── experiments │ └── performance_test │ ├── config.json │ └── config.schema.json ├── tests ├── test_example.py └── test_tutorials.py ├── .readthedocs.yaml ├── .github ├── PULL_REQUEST_TEMPLATE │ ├── docs.md │ ├── feature_request.md │ └── bug.md ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature_request.md │ └── question.md └── workflows │ └── ci.yml ├── tox.ini ├── pyproject.toml ├── .gitignore └── schemes └── architecture_levels.md /hypex/forks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/ui/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/analyzers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/factory/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hypex/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.2" 2 | -------------------------------------------------------------------------------- /hypex/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from .operators import SMD 2 | 3 | __all__ = ["SMD"] 4 | 
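Note: the `SMD` re-exported above stands for standardized mean difference and lives in `operators/operators.py`, which the tree omits. For orientation only, here is a hedged sketch of the textbook formula with an equal-weight pooled standard deviation; it is not the library's actual implementation, which operates on `Dataset` objects:

import numpy as np

def smd_sketch(treated: np.ndarray, control: np.ndarray) -> float:
    # Textbook standardized mean difference: difference of sample means
    # scaled by the pooled (equal-weight) sample standard deviation.
    pooled_std = np.sqrt((treated.std(ddof=1) ** 2 + control.std(ddof=1) ** 2) / 2)
    return float((treated.mean() - control.mean()) / pooled_std)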
-------------------------------------------------------------------------------- /hypex/hypotheses/__init__.py: -------------------------------------------------------------------------------- 1 | # from .hypothesis import Hypothesis 2 | 3 | # __all__ = ["Hypothesis"] 4 | -------------------------------------------------------------------------------- /hypex/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from .faiss import FaissNearestNeighbors 2 | 3 | __all__ = ["FaissNearestNeighbors"] 4 | -------------------------------------------------------------------------------- /hypex/dataset/backends/__init__.py: -------------------------------------------------------------------------------- 1 | from .pandas_backend import PandasDataset 2 | 3 | __all__ = ["PandasDataset"] 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203, E266, E501, W503 4 | exclude = .venv, .git, __pycache__, build, dist 5 | -------------------------------------------------------------------------------- /hypex/splitters/__init__.py: -------------------------------------------------------------------------------- 1 | from .aa import AASplitter, AASplitterWithStratification 2 | 3 | __all__ = ["AASplitter", "AASplitterWithStratification"] 4 | -------------------------------------------------------------------------------- /hypex/executor/__init__.py: -------------------------------------------------------------------------------- 1 | from .executor import Calculator, Executor, IfExecutor, MLExecutor 2 | 3 | __all__ = ["Calculator", "Executor", "IfExecutor", "MLExecutor"] 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel 2 | nbsphinx 3 | nbsphinx-link 4 | sphinx-autodoc-typehints 5 | pandoc 6 | jupyter 7 | prompt-toolkit<3.0.0,!=3.0.1,>=2.0.0 8 | sphinx_rtd_theme 9 | -------------------------------------------------------------------------------- /hypex/utils/constants.py: -------------------------------------------------------------------------------- 1 | ID_SPLIT_SYMBOL = "\u2534" 2 | NAME_BORDER_SYMBOL = "\u2506" 3 | MATCHING_INDEXES_SPLITTER_SYMBOL = "\u256f" 4 | 5 | NUMBER_TYPES_LIST = [int, float] 6 | -------------------------------------------------------------------------------- /hypex/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Experiment, OnRoleExperiment 2 | from .base_complex import CycledExperiment, GroupExperiment 3 | 4 | __all__ = ["CycledExperiment", "Experiment", "GroupExperiment", "OnRoleExperiment"] 5 | -------------------------------------------------------------------------------- /hypex/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version__ import __version__ 2 | from .aa import AATest 3 | from .ab import ABTest 4 | from .homogeneity import HomogeneityTest 5 | from .matching import Matching 6 | 7 | __all__ = ["AATest", "ABTest", "HomogeneityTest", "Matching", "__version__"] 8 | -------------------------------------------------------------------------------- /coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export 
PYTHONPATH=$PYTHONPATH:. 4 | export PYTHONWARNINGS="ignore" 5 | 6 | coverage run --include="hypex/dataset/*" unitests/unitests.py 7 | 8 | # coverage report -m 9 | 10 | coverage html -d unitests/coverage_report 11 | 12 | rm -f .coverage -------------------------------------------------------------------------------- /hypex/reporters/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import DatasetReporter, DictReporter, Reporter 2 | from .homo import HomoDatasetReporter, HomoDictReporter 3 | 4 | __all__ = [ 5 | "DatasetReporter", 6 | "DictReporter", 7 | "HomoDatasetReporter", 8 | "HomoDictReporter", 9 | "Reporter", 10 | ] 11 | -------------------------------------------------------------------------------- /docs/_templates/functiontemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | {{ name | underline }} 6 | 7 | .. autofunction:: {{ fullname }} 8 | 9 | .. 10 | autogenerated from source/_templates/functiontemplate.rst 11 | note it does not have :inherited-members: 12 | -------------------------------------------------------------------------------- /docs/_templates/classtemplate.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline }} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 13 | autogenerated from source/_templates/classtemplate.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | 12 | .. 
13 | autogenerated from source/_templates/autosummary/class.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /hypex/ui/homo.py: -------------------------------------------------------------------------------- 1 | from ..dataset import Dataset, ExperimentData 2 | from ..reporters.homo import HomoDatasetReporter 3 | from .base import Output 4 | 5 | 6 | class HomoOutput(Output): 7 | resume: Dataset 8 | 9 | def __init__(self): 10 | super().__init__(resume_reporter=HomoDatasetReporter()) 11 | 12 | def extract(self, experiment_data: ExperimentData): 13 | super().extract(experiment_data) 14 | -------------------------------------------------------------------------------- /hypex/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from ..encoders.encoders import DummyEncoder 2 | from .category_agg import CategoryAggregator 3 | from .filters import ConstFilter, CorrFilter, CVFilter, NanFilter, OutliersFilter 4 | from .na_filler import NaFiller 5 | from .shuffle import Shuffle 6 | 7 | __all__ = [ 8 | "CVFilter", 9 | "CategoryAggregator", 10 | "ConstFilter", 11 | "CorrFilter", 12 | "DummyEncoder", 13 | "NaFiller", 14 | "NanFilter", 15 | "OutliersFilter", 16 | "Shuffle", 17 | ] 18 | -------------------------------------------------------------------------------- /docs/_static/style.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: none; 3 | } 4 | 5 | .rst-content code.xref { 6 | /* !important prevents the common CSS stylesheets from overriding 7 | this as on RTD they are loaded after this stylesheet */ 8 | color: #E74C3C 9 | } 10 | 11 | html.writer-html4 .rst-content dl:not(.docutils) dl:not(.field-list)>dt, html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt { 12 | border-left-color: rgb(9, 183, 14) 13 | } 14 | -------------------------------------------------------------------------------- /hypex/encoders/encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..dataset import Dataset 4 | from ..extensions.encoders import DummyEncoderExtension 5 | from .abstract import Encoder 6 | 7 | 8 | class DummyEncoder(Encoder): 9 | @staticmethod 10 | def _inner_function( 11 | data: Dataset, target_cols: str | None = None, **kwargs 12 | ) -> Dataset: 13 | if not target_cols: 14 | return data 15 | return DummyEncoderExtension().calc( 16 | data=data, target_cols=target_cols, **kwargs 17 | ) 18 | -------------------------------------------------------------------------------- /hypex/comparators/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import Comparator, StatHypothesisTesting 2 | from .comparators import PSI, GroupDifference, GroupSizes 3 | from .distances import MahalanobisDistance 4 | from .hypothesis_testing import Chi2Test, KSTest, TTest, UTest 5 | from .power_testing import MDEBySize, PowerTesting 6 | 7 | __all__ = [ 8 | "PSI", 9 | "Chi2Test", 10 | "Comparator", 11 | "GroupDifference", 12 | "GroupSizes", 13 | "KSTest", 14 | "MDEBySize", 15 | "MahalanobisDistance", 16 | "PowerTesting", 17 | "StatHypothesisTesting", 18 | "TTest", 19 | "UTest", 20 | ] 21 | 
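Note: these comparators are building blocks for experiment pipelines; `hypex/homogeneity.py` below shows the canonical composition. A minimal construction sketch based on that file (the keyword arguments mirror the executors used in HOMOGENEITY_TEST; execution details are omitted):

from hypex.comparators import GroupDifference, GroupSizes, TTest
from hypex.dataset import TreatmentRole
from hypex.experiments.base import Experiment

# Each executor splits rows by the treatment column and compares the
# resulting groups, as in the HOMOGENEITY_TEST pipeline defined below.
experiment = Experiment(
    executors=[
        GroupSizes(grouping_role=TreatmentRole(), compare_by="groups"),
        GroupDifference(grouping_role=TreatmentRole(), compare_by="groups"),
        TTest(grouping_role=TreatmentRole(), compare_by="groups"),
    ]
)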
-------------------------------------------------------------------------------- /examples/experiments/performance_test/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "onefactor_params" : { 3 | "n_rows": [10, 100], 4 | "n_columns": [10, 15], 5 | "n_iterations": [10, 20] 6 | }, 7 | "montecarlo_params": { 8 | "num_points" : 10, 9 | "bounds": { 10 | "n_rows" : { 11 | "max" : 10000, 12 | "min" : 100 13 | }, 14 | "n_iterations" : { 15 | "max" : 100, 16 | "min" : 5 17 | }, 18 | "n_columns" : { 19 | "max" : 30, 20 | "min" : 10 21 | } 22 | } 23 | }, 24 | "fixed_params" : { 25 | "n_columns": 10, 26 | "n_rows": 1000, 27 | "n_iterations": 5 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /hypex/reporters/homo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from .aa import OneAADictReporter 7 | from .abstract import DatasetReporter 8 | 9 | 10 | class HomoDictReporter(OneAADictReporter): 11 | def report(self, data: ExperimentData) -> dict[str, Any]: 12 | return self.extract_data_from_analysis_tables(data) 13 | 14 | 15 | class HomoDatasetReporter(DatasetReporter): 16 | def __init__(self): 17 | super().__init__(dict_reporter=HomoDictReporter(front=False)) 18 | 19 | @staticmethod 20 | def convert_to_dataset(data: dict) -> dict[str, Dataset] | Dataset: 21 | return HomoDictReporter.convert_flat_dataset(data) 22 | -------------------------------------------------------------------------------- /hypex/utils/adapter.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | 6 | class Adapter: 7 | @staticmethod 8 | def to_list(data: Any) -> list: 9 | if data is None: 10 | return [] 11 | if isinstance(data, str): 12 | return [data] 13 | return list(data) if isinstance(data, Sequence) else [data] 14 | 15 | @staticmethod 16 | def list_to_single(data: list) -> Any: 17 | if isinstance(data, list): 18 | if len(data) == 0: 19 | return None 20 | elif len(data) == 1: 21 | return data[0] 22 | else: 23 | raise ValueError("Only a list of a single item can be accepted") 24 | -------------------------------------------------------------------------------- /hypex/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoders import DummyEncoderExtension 2 | from .faiss import FaissExtension 3 | from .scipy_linalg import CholeskyExtension, InverseExtension 4 | from .scipy_stats import ( 5 | Chi2TestExtension, 6 | KSTestExtension, 7 | TTestExtension, 8 | UTestExtension, 9 | ) 10 | from .statsmodels import MultiTest, MultitestQuantile 11 | 12 | __all__ = [ 13 | "Chi2TestExtension", 14 | "CholeskyExtension", 15 | "DummyEncoderExtension", 16 | "FaissExtension", 17 | "InverseExtension", 18 | "KSTestExtension", 19 | "MultiTest", 20 | "MultitestQuantile", 21 | "TTestExtension", 22 | "UTestExtension", 23 | ] 24 | -------------------------------------------------------------------------------- /hypex/transformers/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from ..dataset import Dataset, ExperimentData 4 | from ..executor import Calculator 5 | from ..utils import AbstractMethodError 6 | 7 | 8 | class 
Transformer(Calculator): 9 | @property 10 | def _is_transformer(self): 11 | return True 12 | 13 | @staticmethod 14 | @abstractmethod 15 | def _inner_function(data: Dataset, **kwargs) -> Dataset: 16 | raise AbstractMethodError 17 | 18 | @classmethod 19 | def calc(cls, data: Dataset, **kwargs): 20 | return cls._inner_function(data, **kwargs) 21 | 22 | def execute(self, data: ExperimentData) -> ExperimentData: 23 | data = data.copy(data=self.calc(data=data.ds)) 24 | return data 25 | -------------------------------------------------------------------------------- /docs/api_reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. currentmodule:: hypex 5 | 6 | Main Classes 7 | ------------ 8 | 9 | .. autosummary:: 10 | :toctree: _autosummary 11 | :nosignatures: 12 | :template: autosummary/class.rst 13 | 14 | AATest 15 | ABTest 16 | HomogeneityTest 17 | matching.Matching 18 | 19 | Dataset Module 20 | -------------- 21 | 22 | .. autosummary:: 23 | :toctree: _autosummary 24 | :nosignatures: 25 | :template: autosummary/class.rst 26 | 27 | dataset.Dataset 28 | dataset.ExperimentData 29 | 30 | Roles 31 | ~~~~~ 32 | 33 | .. autosummary:: 34 | :toctree: _autosummary 35 | :nosignatures: 36 | 37 | dataset.TargetRole 38 | dataset.TreatmentRole 39 | dataset.FeatureRole 40 | dataset.InfoRole 41 | -------------------------------------------------------------------------------- /tests/test_example.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import pytest 4 | 5 | 6 | @pytest.mark.parametrize("test_input, expected", [(1, 1), (2, 2), (3, 3)]) 7 | def test_example(test_input: Any, expected: Any) -> None: 8 | """ 9 | Tests if the input values are equal to the expected values. 10 | 11 | This test uses parametrization to check multiple pairs of values. 12 | It ensures that each input argument is equal to its expected value. 13 | 14 | Args: 15 | test_input: The input value for the test. 16 | expected: The expected value to compare against the input. 17 | 18 | Returns: 19 | None. The test simply asserts the condition. 20 | """ 21 | assert test_input == expected, f"Expected {expected}, got {test_input}" 22 | return 23 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. HypEx documentation master file 2 | 3 | Welcome to HypEx's documentation! 4 | ================================== 5 | 6 | HypEx is a fast and customizable framework for Causal Inference. 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: Contents: 11 | 12 | installation 13 | quickstart 14 | api_reference 15 | 16 | Installation 17 | ------------ 18 | 19 | .. code-block:: bash 20 | 21 | pip install hypex 22 | 23 | Quick Start 24 | ----------- 25 | 26 | .. 
code-block:: python 27 | 28 | from hypex import ABTest, AATest, Matching 29 | from hypex.dataset import Dataset, TargetRole, TreatmentRole 30 | 31 | # Your code here 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | formats: all 20 | 21 | # Optional but recommended, declare the Python requirements required 22 | # to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt 27 | - path: . 28 | -------------------------------------------------------------------------------- /hypex/preprocessing.py: -------------------------------------------------------------------------------- 1 | from .encoders.encoders import DummyEncoder 2 | from .experiments.base import Experiment 3 | from .transformers.category_agg import CategoryAggregator 4 | from .transformers.filters import ( 5 | ConstFilter, 6 | CorrFilter, 7 | CVFilter, 8 | NanFilter, 9 | OutliersFilter, 10 | ) 11 | from .transformers.na_filler import NaFiller 12 | 13 | PREPROCESSING_DATA = Experiment( 14 | executors=[ 15 | NaFiller(method="ffill"), 16 | CategoryAggregator(), 17 | CorrFilter(), 18 | CVFilter(), 19 | NanFilter(), 20 | ConstFilter(), 21 | OutliersFilter(lower_percentile=0.05, upper_percentile=0.95), 22 | DummyEncoder(), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /hypex/transformers/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from ..executor.executor import Calculator 7 | 8 | 9 | class Shuffle(Calculator): 10 | def __init__( 11 | self, 12 | random_state: int | None = None, 13 | key: Any = "", 14 | ): 15 | super().__init__(key) 16 | self.random_state = random_state 17 | 18 | @staticmethod 19 | def _inner_function(data: Dataset, random_state: int | None = None) -> Dataset: 20 | return data.shuffle(random_state=random_state) 21 | 22 | def generate_params_hash(self): 23 | return f"{self.random_state}" 24 | 25 | def execute(self, data: ExperimentData) -> ExperimentData: 26 | result = data.copy(data=self.calc(data=data.ds, random_state=self.random_state)) 27 | return result 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | clean: 23 | sphinx-build -M clean "$(SOURCEDIR)" "$(BUILDDIR)" 24 | sphinx-build -M clean "$(SOURCEDIR)" "imgs" 25 | sphinx-build -M clean "$(SOURCEDIR)" "pages/modules/generated/" 26 | rm -rf "$(SOURCEDIR)/_autosummary" 27 | -------------------------------------------------------------------------------- /hypex/extensions/scipy_linalg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd # type: ignore 3 | 4 | from ..dataset import Dataset 5 | from ..dataset.roles import FeatureRole 6 | from .abstract import Extension 7 | 8 | 9 | class CholeskyExtension(Extension): 10 | def _calc_pandas(self, data: Dataset, epsilon: float = 1e-3, **kwargs): 11 | cov = data.data.to_numpy() 12 | cov = cov + np.eye(cov.shape[0]) * epsilon 13 | return self.result_to_dataset( 14 | pd.DataFrame(np.linalg.cholesky(cov), columns=data.columns), 15 | {column: FeatureRole() for column in data.columns}, 16 | ) 17 | 18 | 19 | class InverseExtension(Extension): 20 | def _calc_pandas(self, data: Dataset, **kwargs): 21 | return self.result_to_dataset( 22 | pd.DataFrame(np.linalg.inv(data.data.to_numpy()), columns=data.columns), 23 | {column: FeatureRole() for column in data.columns}, 24 | ) 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Requirements 5 | ------------ 6 | 7 | * Python 3.8 or higher 8 | * NumPy 9 | * Pandas 10 | * SciPy 11 | * Scikit-learn 12 | * Statsmodels 13 | 14 | Basic Installation 15 | ------------------ 16 | 17 | Install HypEx using pip: 18 | 19 | .. code-block:: bash 20 | 21 | pip install hypex 22 | 23 | Development Installation 24 | ------------------------ 25 | 26 | For development, clone the repository and install in editable mode: 27 | 28 | .. code-block:: bash 29 | 30 | git clone https://github.com/sb-ai-lab/HypEx.git 31 | cd HypEx 32 | pip install -e . 33 | 34 | Optional Dependencies 35 | --------------------- 36 | 37 | For additional functionality, install with extras: 38 | 39 | .. 
code-block:: bash 40 | 41 | # For CatBoost support 42 | pip install hypex[cat] 43 | 44 | # For LightGBM support 45 | pip install hypex[lgbm] 46 | 47 | # All extras 48 | pip install hypex[all] 49 | -------------------------------------------------------------------------------- /hypex/extensions/encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copy 4 | 5 | import pandas as pd # type: ignore 6 | 7 | from ..dataset import Dataset, DatasetAdapter 8 | from .abstract import Extension 9 | 10 | 11 | class DummyEncoderExtension( 12 | Extension 13 | ): # TODO: role types are being rewritten, needs to be fixed 14 | @staticmethod 15 | def _calc_pandas(data: Dataset, target_cols: str | None = None, **kwargs): 16 | dummies_df = pd.get_dummies(data=data[target_cols].data, drop_first=True) 17 | # Assign a role to each dummy column by looking up its source column, 18 | # recovered from the part of the dummy name before the last "_" 19 | roles = {col: data.roles[col[: col.rfind("_")]] for col in dummies_df.columns} 20 | new_roles = copy.deepcopy(roles) # copy so the roles in data.roles stay intact 21 | for role in new_roles.values(): 22 | role.data_type = bool 23 | return DatasetAdapter.to_dataset(dummies_df, roles=new_roles) 24 | -------------------------------------------------------------------------------- /hypex/utils/enums.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | @enum.unique 5 | class ExperimentDataEnum(enum.Enum): 6 | variables = "variables" 7 | additional_fields = "additional_fields" 8 | analysis_tables = "analysis_tables" 9 | groups = "groups" 10 | 11 | 12 | @enum.unique 13 | class BackendsEnum(enum.Enum): 14 | pandas = "pandas" 15 | 16 | 17 | @enum.unique 18 | class SpaceEnum(enum.Enum): 19 | auto = "auto" 20 | additional = "additional" 21 | data = "data" 22 | 23 | 24 | @enum.unique 25 | class ABNTestMethodsEnum(enum.Enum): 26 | bonferroni = "bonferroni" 27 | sidak = "sidak" 28 | holm_sidak = "holm-sidak" 29 | holm = "holm" 30 | simes_hochberg = "simes-hochberg" 31 | hommel = "hommel" 32 | fdr_bh = "fdr_bh" 33 | fdr_by = "fdr_by" 34 | fdr_tsbh = "fdr_tsbh" 35 | fdr_tsbky = "fdr_tsbky" 36 | quantile = "quantile" 37 | 38 | 39 | @enum.unique 40 | class RenameEnum(enum.Enum): 41 | all = "all" 42 | columns = "columns" 43 | index = "index" 44 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | {{ name | underline }} 5 | 6 | .. automodule:: {{ fullname }} 7 | 8 | {% block classes %} 9 | {% if classes %} 10 | 11 | .. rubric:: {{ _('Classes') }} 12 | 13 | .. autosummary:: 14 | :toctree: generated 15 | :nosignatures: 16 | :template: classtemplate.rst 17 | 18 | {% for item in classes %} 19 | {{ item }} 20 | {% endfor %} 21 | {% endif %} 22 | {% endblock %} 23 | 24 | {% block functions %} 25 | {% if functions %} 26 | 27 | .. rubric:: {{ _('Functions') }} 28 | 29 | .. autosummary:: 30 | :toctree: generated 31 | :nosignatures: 32 | :template: functiontemplate.rst 33 | 34 | {% for item in functions %} 35 | {{ item }} 36 | {% endfor %} 37 | {% endif %} 38 | {% endblock %} 39 | 40 | {% block modules %} 41 | {% if modules %} 42 | 43 | .. rubric:: {{ _('Modules') }} 44 | 45 | .. 
autosummary:: 46 | :toctree: 47 | :recursive: 48 | 49 | {% for item in modules %} 50 | {{ item }} 51 | {% endfor %} 52 | {% endif %} 53 | {% endblock %} 54 | -------------------------------------------------------------------------------- /hypex/utils/typings.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import ( 3 | TYPE_CHECKING, 4 | Any, 5 | Callable, 6 | Dict, 7 | List, 8 | Sequence, 9 | Tuple, 10 | TypeVar, 11 | Union, 12 | ) 13 | 14 | if TYPE_CHECKING: 15 | from hypex.dataset import Dataset 16 | 17 | StratificationRoleTypes = Union[float, str, datetime.datetime] 18 | DefaultRoleTypes = Union[float, bool, str, int] 19 | TargetRoleTypes = Union[float, int, bool] 20 | CategoricalTypes = Union[str] 21 | ScalarType = Union[float, int, str, bool] 22 | GroupingDataType = Tuple[List[Tuple[str, "Dataset"]], List[Tuple[str, "Dataset"]]] 23 | 24 | 25 | MultiFieldKeyTypes = Union[str, Sequence[str]] 26 | 27 | FromDictTypes = Union[ 28 | Dict[str, List[Any]], 29 | List[Dict[Any, Any]], 30 | Dict[str, Dict[Any, List]], 31 | Dict[str, "Dataset"], 32 | ] 33 | RoleNameType = str 34 | DecoratedType = TypeVar("DecoratedType", bound=Union[Callable[..., Any], property]) 35 | DocstringInheritDecorator = Callable[[DecoratedType], DecoratedType] 36 | 37 | SetParamsDictTypes = Union[Dict[str, Any], Dict[type, Dict[str, Any]]] 38 | -------------------------------------------------------------------------------- /hypex/analyzers/matching.py: -------------------------------------------------------------------------------- 1 | from ..dataset.dataset import DatasetAdapter, ExperimentData 2 | from ..dataset.roles import StatisticRole 3 | from ..executor.executor import Executor 4 | from ..operators.operators import MatchingMetrics 5 | from ..utils.enums import ExperimentDataEnum 6 | 7 | 8 | class MatchingAnalyzer(Executor): 9 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 10 | return data.set_value( 11 | ExperimentDataEnum.analysis_tables, self.id, value, key=key 12 | ) 13 | 14 | def execute(self, data: ExperimentData): 15 | variables = data.variables[ 16 | data.get_one_id(MatchingMetrics, space=ExperimentDataEnum.variables) 17 | ] 18 | columns = ["Effect Size", "Standard Error", "P-value", "CI Lower", "CI Upper"] 19 | return self._set_value( 20 | data, 21 | DatasetAdapter.to_dataset( 22 | variables, 23 | {field: StatisticRole() for field in list(variables.keys())}, 24 | ).transpose(roles={column: StatisticRole() for column in columns}), 25 | ) 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/docs.md: -------------------------------------------------------------------------------- 1 | ### Documentation Update Description 2 | 3 | 4 | 5 | ### Areas of Documentation Updated 6 | 7 | 8 | 9 | 1. 10 | 2. 11 | 3. 12 | 13 | ### Details of Changes 14 | 15 | 16 | 17 | ### Screenshots / Code Snippets 18 | 19 | 20 | 21 | ### Related Issues or Pull Requests 22 | 23 | 24 | 25 | ### Additional Notes 26 | 27 | 28 | 29 | ### Checklist 30 | 31 | - [ ] The changes are clear and easy to understand. 32 | - [ ] I have verified that the changes are accurate and necessary. 33 | - [ ] The updated documentation has been tested for clarity and comprehensibility. 34 | - [ ] All modified sections are properly formatted and adhere to project documentation standards. 
35 | -------------------------------------------------------------------------------- /hypex/forks/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..analyzers.aa import OneAAStatAnalyzer 4 | from ..executor.executor import Executor, IfExecutor 5 | from ..utils.enums import ExperimentDataEnum 6 | 7 | 8 | class IfAAExecutor(IfExecutor): 9 | def __init__( 10 | self, 11 | if_executor: Executor | None = None, 12 | else_executor: Executor | None = None, 13 | sample_size: float | None = None, 14 | key: str = "", 15 | ): 16 | self.sample_size = sample_size 17 | super().__init__(if_executor, else_executor, key) 18 | 19 | def check_rule(self, data, **kwargs) -> bool: 20 | if self.sample_size is not None: 21 | score_table_id = data.get_one_id( 22 | OneAAStatAnalyzer, 23 | ExperimentDataEnum.analysis_tables, 24 | ) 25 | score_table = data.analysis_tables[score_table_id] 26 | feature_pass = sum( 27 | [ 28 | score_table.loc[:, column].get_values()[0][0] 29 | for column in score_table.columns 30 | if "pass" in column 31 | ] 32 | ) 33 | return True if feature_pass >= 1 else False 34 | return False 35 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | ### Changes Made 6 | 7 | 8 | 9 | ### Related Issues 10 | 11 | 12 | 13 | ### Additional Notes 14 | 15 | 16 | 17 | ### Testing and Validation 18 | 19 | 20 | 21 | ### Performance Considerations 22 | 23 | 24 | 25 | ### Breaking Changes 26 | 27 | 28 | 29 | ### Dependencies 30 | 31 | 32 | 33 | ### Merge Request Checklist 34 | 35 | - [ ] Code follows project coding guidelines. 36 | - [ ] Documentation reflects the changes made. 37 | - [ ] Unit tests cover new or changed functionality. 38 | - [ ] Performance and breaking changes have been considered. 39 | -------------------------------------------------------------------------------- /docs/mock_docs.py: -------------------------------------------------------------------------------- 1 | """A one line summary of the module or program, terminated by a period. 2 | 3 | Leave one blank line. The rest of this docstring should contain an 4 | overall description of the module or program. Optionally, it may also 5 | contain a brief description of exported classes and functions and/or usage 6 | examples. 7 | 8 | Typical usage example: 9 | 10 | >>> print('something') 11 | something 12 | >>> a = MyClass('be', 'or', 'not') 13 | 14 | """ 15 | 16 | import datetime 17 | 18 | 19 | class MyClass: 20 | """Description of class. 21 | 22 | Really do nothing. 23 | 24 | Attributes: 25 | attr1 (str): Description of `attr1`. 26 | attr2 (str): Description of `attr2`. 27 | 28 | Args: 29 | attr1: Description of `attr1`. 30 | attr2: Description of `attr2`. 31 | 32 | 33 | """ 34 | 35 | def __init__(self, attr1: str, attr2: str): 36 | self.attr1 = attr1 37 | self.attr2 = attr2 38 | date = datetime.datetime.now() 39 | print( 40 | f"{date.day}.{date.month}.{date.year} {date.hour}:{date.minute}:{date.second}" 41 | ) 42 | 43 | 44 | # .. 
toctree:: 45 | # :glob: 46 | # :maxdepth: 1 47 | # :caption: Tutorials 48 | # 49 | # tutorials/tutor_1.ipynb 50 | # tutorials/tutor_2.ipynb 51 | # tutorials/tutor_3.ipynb 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🐛 Bug Description 11 | 12 | 13 | 14 | ### Steps To Reproduce 15 | 16 | 17 | 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | ### Expected Behavior 24 | 25 | 26 | 27 | ### Screenshots 28 | 29 | 30 | 31 | ### Environment 32 | 33 | 34 | 35 | - HypEx Version: [e.g. 0.0.4] 36 | - Python Version: [e.g. 3.8] 37 | - Operating System: [e.g. iOS, Windows, Linux] 38 | 39 | ### Additional Context 40 | 41 | 42 | 43 | ### Possible Solution 44 | 45 | 46 | 47 | ### Checklist 48 | 49 | - [ ] I have described the bug in detail 50 | - [ ] I have provided steps to reproduce 51 | - [ ] I have provided the expected behavior 52 | - [ ] I have provided screenshots (if applicable) 53 | - [ ] I have provided my environment details 54 | - [ ] I have suggested a possible solution (if applicable) 55 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] ' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🚀 Feature Proposal 11 | 12 | 13 | 14 | ### Motivation 15 | 16 | 17 | 18 | ### Feature Description 19 | 20 | 21 | 22 | ### Potential Impacts 23 | 24 | 29 | 30 | ### Alternatives 31 | 32 | 33 | 34 | ### Additional Context 35 | 36 | 37 | 38 | ### Checklist 39 | 40 | - [ ] I have clearly described the feature. 41 | - [ ] I have outlined the motivation for the proposal. 42 | - [ ] I have provided a detailed description of the feature. 43 | - [ ] I have discussed potential impacts and alternatives. 44 | - [ ] I have added any additional context or screenshots. 45 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | ### Bug Description 2 | 3 | 4 | 5 | ### Steps to Reproduce 6 | 7 | 8 | 9 | 1. 10 | 2. 11 | 3. 12 | 4. 13 | 14 | ### Expected Behavior 15 | 16 | 17 | 18 | ### Actual Behavior 19 | 20 | 21 | 22 | ### Changes Made 23 | 24 | 25 | 26 | ### Testing Performed 27 | 28 | 29 | 30 | ### Related Issues 31 | 32 | 33 | 34 | ### Additional Notes 35 | 36 | 37 | 38 | ### Checklist 39 | 40 | - [ ] The code follows project coding guidelines. 41 | - [ ] I have added tests to cover my changes. 42 | - [ ] All new and existing tests passed. 43 | - [ ] Documentation has been updated to reflect the changes made. 44 | - [ ] I have verified that the changes fix the issue as described. 
45 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: How to Question 3 | about: Ask for guidance or clarification on how to use the project 4 | title: '[QUESTION] ' 5 | labels: help wanted, question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## ❓ How to Question 11 | 12 | ### Context 13 | 14 | 15 | 16 | ### Question 17 | 18 | 19 | 20 | ### What I've Tried 21 | 22 | 23 | 24 | ### Code (if applicable) 25 | 26 | 27 | 28 | ### Research 29 | 30 | 1. Have you searched for similar questions in existing issues? 31 | 2. Have you consulted the documentation to find an answer? 32 | 33 | ### Additional Context 34 | 35 | 36 | 37 | ### Checklist 38 | 39 | - [ ] I have provided context for my question. 40 | - [ ] I have stated my question clearly. 41 | - [ ] I have shared what I've tried and what I've learned. 42 | - [ ] I have checked existing issues and documentation. 43 | - [ ] I have provided code and additional context if applicable. 44 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick Start Guide 2 | ================= 3 | 4 | This guide will help you get started with HypEx. 5 | 6 | Basic Usage 7 | ----------- 8 | 9 | A/B Testing 10 | ~~~~~~~~~~~ 11 | 12 | .. code-block:: python 13 | 14 | from hypex import ABTest 15 | from hypex.dataset import Dataset, FeatureRole, TargetRole, TreatmentRole 16 | import pandas as pd 17 | 18 | # Load your data 19 | df = pd.read_csv('your_data.csv') 20 | 21 | # Create dataset with roles 22 | data = Dataset( 23 | roles={ 24 | 'conversion': TargetRole(), 25 | 'group': TreatmentRole(), 26 | 'feature1': FeatureRole(), 27 | 'feature2': FeatureRole() 28 | }, 29 | data=df 30 | ) 31 | 32 | # Run A/B test 33 | ab_test = ABTest() 34 | results = ab_test.execute(data) 35 | 36 | # View results 37 | print(results.resume) 38 | 39 | A/A Testing 40 | ~~~~~~~~~~~ 41 | 42 | .. code-block:: python 43 | 44 | from hypex import AATest 45 | 46 | # Run A/A test to check for sample ratio mismatch 47 | aa_test = AATest( 48 | n_iterations=100, 49 | stratification=True 50 | ) 51 | results = aa_test.execute(data) 52 | 53 | # Check if splits are good 54 | print(results.resume) 55 | 56 | Matching 57 | ~~~~~~~~ 58 | 59 | .. 
code-block:: python 60 | 61 | from hypex import Matching 62 | 63 | # Perform matching analysis 64 | matching = Matching( 65 | distance="mahalanobis", 66 | metric="att" 67 | ) 68 | results = matching.execute(data) 69 | 70 | # View matched pairs and treatment effects 71 | print(results.resume) 72 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths-ignore: 7 | - "docs/**" 8 | - "*.md" 9 | pull_request: 10 | branches: [ master ] 11 | paths-ignore: 12 | - "docs/**" 13 | - "*.md" 14 | 15 | jobs: 16 | test: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 21 | os: [ubuntu-latest, macos-latest, windows-latest] 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install dependencies for tox 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install tox 34 | pip install poetry 35 | pip install pytest 36 | poetry install --no-root --without dev 37 | 38 | - name: Run unit tests 39 | run: tox -e py 40 | 41 | linters: 42 | runs-on: ubuntu-latest # Linters and docs run on a single Python version only 43 | steps: 44 | - uses: actions/checkout@v4 45 | 46 | - name: Set up Python 3.10 47 | uses: actions/setup-python@v4 48 | with: 49 | python-version: "3.10" 50 | 51 | - name: Install dependencies for linters 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install tox 55 | pip install poetry 56 | poetry install --no-root 57 | 58 | - name: Run linters (mypy, codespell, docs) 59 | run: | 60 | tox -------------------------------------------------------------------------------- /hypex/reporters/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, ClassVar 4 | 5 | from ..analyzers.ab import ABAnalyzer 6 | from ..comparators import Chi2Test, TTest, UTest 7 | from ..dataset import Dataset, ExperimentData 8 | from ..utils import ExperimentDataEnum 9 | from .aa import OneAADictReporter 10 | 11 | 12 | class ABDictReporter(OneAADictReporter): 13 | tests: ClassVar[list] = [TTest, UTest, Chi2Test] 14 | 15 | def extract_analyzer_data(self, data: ExperimentData) -> dict[str, Any]: 16 | analyzer_id = data.get_one_id(ABAnalyzer, ExperimentDataEnum.analysis_tables) 17 | return self.extract_from_one_row_dataset(data.analysis_tables[analyzer_id]) 18 | 19 | def extract_data_from_analysis_tables(self, data: ExperimentData) -> dict[str, Any]: 20 | result = {} 21 | result.update(self.extract_group_sizes(data)) 22 | result.update(self.extract_group_difference(data)) 23 | result.update(self.extract_tests(data)) 24 | result.update(self.extract_analyzer_data(data)) 25 | return result 26 | 27 | def report(self, data: ExperimentData) -> dict[str, Any]: 28 | return self.extract_data_from_analysis_tables(data) 29 | 30 | 31 | class ABDatasetReporter(ABDictReporter): 32 | @staticmethod 33 | def _invert_aa_format(table: Dataset) -> Dataset: 34 | return table.replace("NOT OK", "N").replace("OK", "NOT OK").replace("N", "OK") 35 | 36 | def report(self, data: ExperimentData): 37 | front_buffer = self.front 38 | self.front = False 39 | dict_report = super().report(data) 40 | 
self.front = front_buffer 41 | result = self.convert_flat_dataset(dict_report) 42 | return self._invert_aa_format(result) 43 | -------------------------------------------------------------------------------- /hypex/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the dataset module in the HypEx library. 2 | This module defines data structures and roles used across the library for managing and manipulating experimental data. 3 | """ 4 | 5 | from .abstract import DatasetBase 6 | from .dataset import Dataset, DatasetAdapter, ExperimentData 7 | from .roles import ( 8 | ABCRole, 9 | AdditionalGroupingRole, 10 | AdditionalMatchingRole, 11 | AdditionalPreTargetRole, 12 | AdditionalTargetRole, 13 | AdditionalTreatmentRole, 14 | ConstGroupRole, 15 | DefaultRole, 16 | FeatureRole, 17 | FilterRole, 18 | GroupingRole, 19 | InfoRole, 20 | PreTargetRole, 21 | StatisticRole, 22 | StratificationRole, 23 | TargetRole, 24 | TempGroupingRole, 25 | TempRole, 26 | TempTargetRole, 27 | TempTreatmentRole, 28 | TreatmentRole, 29 | default_roles, 30 | ) 31 | 32 | __all__ = [ 33 | "ABCRole", 34 | "AdditionalGroupingRole", 35 | "AdditionalMatchingRole", 36 | "AdditionalPreTargetRole", 37 | "AdditionalTargetRole", 38 | "AdditionalTreatmentRole", 39 | "ConstGroupRole", 40 | "Dataset", 41 | "DatasetAdapter", 42 | "DatasetBase", 43 | "DefaultRole", 44 | "ExperimentData", 45 | "FeatureRole", 46 | "FilterRole", 47 | "GroupingRole", 48 | "InfoRole", 49 | "PreTargetRole", 50 | "StatisticRole", 51 | "StratificationRole", 52 | "TargetRole", 53 | "TempGroupingRole", 54 | "TempRole", 55 | "TempTargetRole", 56 | "TempTreatmentRole", 57 | "TreatmentRole", 58 | "default_roles", 59 | ] 60 | -------------------------------------------------------------------------------- /hypex/encoders/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | from ..dataset import Dataset, ExperimentData, FeatureRole 6 | from ..executor import Calculator 7 | from ..utils import ( 8 | NAME_BORDER_SYMBOL, 9 | AbstractMethodError, 10 | CategoricalTypes, 11 | ExperimentDataEnum, 12 | ) 13 | 14 | 15 | class Encoder(Calculator): 16 | def __init__( 17 | self, 18 | target_roles: str | Sequence[str] | None = None, 19 | key: Any = "", 20 | ): 21 | self.target_roles = target_roles or FeatureRole() 22 | self._key = key 23 | super().__init__(key) 24 | 25 | @property 26 | def __is_encoder(self): 27 | return True 28 | 29 | @property 30 | def search_types(self): 31 | return [CategoricalTypes] 32 | 33 | def _get_ids(self, col_name): 34 | self.key = f"{NAME_BORDER_SYMBOL}{col_name}{NAME_BORDER_SYMBOL}" 35 | return self.id 36 | 37 | def _ids_to_names(self, col_names: list[str]): 38 | return {col_name: self._get_ids(col_name) for col_name in col_names} 39 | 40 | @staticmethod 41 | def _inner_function(data: Dataset, **kwargs) -> Dataset: 42 | raise AbstractMethodError 43 | 44 | def _set_value( 45 | self, data: ExperimentData, value: Dataset, key=None 46 | ) -> ExperimentData: 47 | return data.set_value( 48 | space=ExperimentDataEnum.additional_fields, 49 | executor_id=self._ids_to_names(value.columns), 50 | value=value, 51 | role=value.roles, 52 | ) 53 | 54 | def execute(self, data: ExperimentData) -> ExperimentData: 55 | target_cols = data.ds.search_columns( 56 | roles=self.target_roles, search_types=self.search_types 57 | ) 58 | return self._set_value( 59 | data=data, 60 
| value=self.calc(data=data.ds, target_cols=target_cols), 61 | key=self.key, 62 | ) 63 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | min_version = 3.28.0 3 | isolated_build = True 4 | envlist = 5 | py{38,39,310,311,312}, 6 | lint, 7 | docs, 8 | typing, 9 | build, 10 | codespell 11 | 12 | [tox:.package] 13 | basepython = python3 14 | 15 | [gh-actions] 16 | python = 17 | 3.8: py38 18 | 3.9: py39 19 | 3.10: py310 20 | 3.11: py311 21 | 3.12: py312 22 | 23 | [testenv] 24 | allowlist_externals = make 25 | package = wheel 26 | deps = 27 | .[all] # Install all dependencies from pyproject.toml 28 | pytest >= 6.2.5 29 | psutil >= 7.0.0 30 | alive-progress >= 3.1.0 31 | jsonschema >= 4.23.0 32 | 33 | 34 | commands = 35 | pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG 36 | 37 | [testenv:lint] 38 | skip_install = true 39 | description = Lint code using Ruff 40 | deps = 41 | ruff >= 0.3.0 42 | commands = 43 | ruff check . --ignore RUF003,C901 --fix 44 | 45 | [testenv:docs] 46 | description = Build documentation using Sphinx 47 | changedir = docs 48 | deps = 49 | sphinx >= 5.3.0 50 | sphinx-autodoc-typehints >= 1.19.5 51 | sphinx-rtd-theme >= 1.1.1 52 | nbsphinx >= 0.8.10 53 | nbsphinx-link >= 1.3.0 54 | doc8 >= 0.10.1 55 | rstcheck >= 3.3.1 56 | pandoc >= 2.0.1 57 | IPython >= 7.0 58 | commands = 59 | make clean html 60 | doc8 . --ignore-path _autosummary --ignore-path _build --ignore-path _templates 61 | 62 | [testenv:typing] 63 | skip_install = true 64 | description = Run type checks with mypy 65 | deps = 66 | mypy >= 0.991 67 | pandas-stubs 68 | pytest 69 | types-pytz 70 | types-tqdm 71 | types-requests 72 | types-pyyaml 73 | commands = 74 | mypy {posargs:. 
tests} 75 | 76 | [testenv:build] 77 | description = Build the project using Poetry 78 | deps = 79 | poetry >= 1.1.7 80 | commands = 81 | poetry build 82 | 83 | [testenv:codespell] 84 | skip_install = true 85 | description = Check for spelling errors 86 | deps = 87 | codespell >= 2.3.0 88 | commands = 89 | codespell --skip="docs,_build,imgs,schemes,poetry.lock" --ignore-words-list="dotA,TE" -------------------------------------------------------------------------------- /hypex/extensions/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Literal 5 | 6 | from ..dataset import ABCRole, Dataset 7 | from ..dataset.backends import PandasDataset 8 | from ..dataset.dataset import DatasetAdapter 9 | from ..utils.errors import AbstractMethodError 10 | 11 | 12 | class Extension(ABC): 13 | def __init__(self): 14 | self.BACKEND_MAPPING = { 15 | PandasDataset: self._calc_pandas, 16 | } 17 | 18 | @abstractmethod 19 | def _calc_pandas(self, data: Dataset, **kwargs): 20 | raise AbstractMethodError 21 | 22 | def calc(self, data: Dataset, **kwargs): 23 | return self.BACKEND_MAPPING[type(data.backend)](data=data, **kwargs) 24 | 25 | @staticmethod 26 | def result_to_dataset(result: Any, roles: ABCRole | dict[str, ABCRole]) -> Dataset: 27 | return DatasetAdapter.to_dataset(result, roles=roles) 28 | 29 | 30 | class CompareExtension(Extension, ABC): 31 | def calc(self, data: Dataset, other: Dataset | None = None, **kwargs): 32 | return super().calc(data=data, other=other, **kwargs) 33 | 34 | 35 | class MLExtension(Extension): 36 | # TODO: add model 37 | def _calc_pandas( 38 | self, 39 | data: Dataset, 40 | test_data: Dataset | None = None, 41 | mode: Literal["auto", "fit", "predict"] | None = None, 42 | **kwargs, 43 | ): 44 | if mode in ["auto", "fit"]: 45 | return self.fit(data, test_data, **kwargs) 46 | return self.predict(data) 47 | 48 | @abstractmethod 49 | def fit(self, X, Y=None, **kwargs): 50 | raise NotImplementedError 51 | 52 | @abstractmethod 53 | def predict(self, X, **kwargs): 54 | raise NotImplementedError 55 | 56 | def calc( 57 | self, 58 | data: Dataset, 59 | target_data: Dataset | None = None, 60 | test_data: Dataset | None = None, 61 | **kwargs, 62 | ): 63 | return super().calc( 64 | data=data, target_data=target_data, test_data=test_data, **kwargs 65 | ) 66 | -------------------------------------------------------------------------------- /hypex/transformers/category_agg.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Sequence 4 | 5 | from ..dataset.dataset import Dataset, ExperimentData 6 | from ..dataset.roles import FeatureRole 7 | from ..utils import CategoricalTypes 8 | from ..utils.adapter import Adapter 9 | from .abstract import Transformer 10 | 11 | 12 | class CategoryAggregator(Transformer): 13 | def __init__( 14 | self, 15 | target_roles: str | Sequence[str] | None = None, 16 | threshold: int | None = 15, 17 | new_group_name: str | None = None, 18 | key: Any = "", 19 | ): 20 | super().__init__(key=key) 21 | self.target_roles = target_roles or FeatureRole() 22 | self.threshold = threshold 23 | self.new_group_name = new_group_name 24 | 25 | @property 26 | def search_types(self): 27 | return [CategoricalTypes] 28 | 29 | @staticmethod 30 | def _inner_function( 31 | data: Dataset, 32 | target_cols: str | None = None, 33 | threshold: 
int | None = 15, 34 | new_group_name: str | None = None, 35 | ) -> Dataset: 36 | target_cols = Adapter.to_list(target_cols) 37 | for column in target_cols: 38 | categories_counts = data[column].value_counts() 39 | values_to_replace = categories_counts[ 40 | categories_counts["count"] < threshold 41 | ][column].get_values(column=column) 42 | data[column] = data[column].replace( 43 | to_replace=values_to_replace, value=new_group_name 44 | ) 45 | 46 | return data 47 | 48 | def execute(self, data: ExperimentData) -> ExperimentData: 49 | target_cols = data.ds.search_columns( 50 | roles=self.target_roles, search_types=self.search_types 51 | ) 52 | result = data.copy( 53 | data=self.calc( 54 | data=data.ds, 55 | target_cols=target_cols, 56 | threshold=self.threshold, 57 | new_group_name=self.new_group_name, 58 | ) 59 | ) 60 | return result 61 | -------------------------------------------------------------------------------- /hypex/ui/aa.py: -------------------------------------------------------------------------------- 1 | from ..analyzers.aa import AAScoreAnalyzer 2 | from ..dataset import Dataset, ExperimentData 3 | from ..reporters.aa import AABestSplitReporter, AAPassedReporter 4 | from ..utils import ExperimentDataEnum 5 | from ..utils.enums import RenameEnum 6 | from .base import Output 7 | 8 | 9 | class AAOutput(Output): 10 | best_split: Dataset 11 | experiments: Dataset 12 | aa_score: Dataset 13 | best_split_statistic: Dataset 14 | 15 | def __init__(self): 16 | super().__init__( 17 | resume_reporter=AAPassedReporter(), 18 | additional_reporters={"best_split": AABestSplitReporter()}, 19 | ) 20 | 21 | def _extract_experiments(self, experiment_data: ExperimentData): 22 | id_ = experiment_data.get_one_id( 23 | "ParamsExperiment", ExperimentDataEnum.analysis_tables, "AATest" 24 | ) 25 | self.experiments = self._replace_splitters( 26 | experiment_data.analysis_tables[id_], RenameEnum.columns 27 | ) 28 | 29 | def _extract_aa_score(self, experiment_data: ExperimentData): 30 | def get_analyzer_id(key: str): 31 | target_id = [i for i in aa_score_analyser_ids if i.endswith(key)] 32 | if len(target_id): 33 | return target_id[0] 34 | else: 35 | raise ValueError("Result of AAScoreAnalyzer was not found.") 36 | 37 | aa_score_analyser_ids = experiment_data.get_ids( 38 | AAScoreAnalyzer, ExperimentDataEnum.analysis_tables 39 | )[AAScoreAnalyzer.__name__][ExperimentDataEnum.analysis_tables.value] 40 | 41 | self.aa_score = experiment_data.analysis_tables[get_analyzer_id("aa score")] 42 | self.aa_score = self._replace_splitters(self.aa_score, RenameEnum.index) 43 | 44 | self.best_split_statistic = experiment_data.analysis_tables[ 45 | get_analyzer_id("best split statistics") 46 | ] 47 | 48 | def extract(self, experiment_data: ExperimentData): 49 | super().extract(experiment_data) 50 | self._extract_experiments(experiment_data) 51 | self._extract_aa_score(experiment_data) 52 | -------------------------------------------------------------------------------- /hypex/comparators/hypothesis_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..dataset import Dataset 4 | from ..extensions.scipy_stats import ( 5 | Chi2TestExtension, 6 | KSTestExtension, 7 | TTestExtension, 8 | UTestExtension, 9 | ) 10 | from ..utils.constants import NUMBER_TYPES_LIST 11 | from .abstract import StatHypothesisTesting 12 | 13 | 14 | class TTest(StatHypothesisTesting): 15 | @property 16 | def search_types(self) -> list[type] | None: 17 | 
return NUMBER_TYPES_LIST 18 | 19 | @classmethod 20 | def _inner_function( 21 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 22 | ) -> Dataset: 23 | return TTestExtension(kwargs.get("reliability", 0.05)).calc( 24 | data, other=test_data, **kwargs 25 | ) 26 | 27 | 28 | class KSTest(StatHypothesisTesting): 29 | @property 30 | def search_types(self) -> list[type] | None: 31 | return NUMBER_TYPES_LIST 32 | 33 | @classmethod 34 | def _inner_function( 35 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 36 | ) -> Dataset: 37 | return KSTestExtension(kwargs.get("reliability", 0.05)).calc( 38 | data, other=test_data, **kwargs 39 | ) 40 | 41 | 42 | class UTest(StatHypothesisTesting): 43 | @property 44 | def search_types(self) -> list[type] | None: 45 | return NUMBER_TYPES_LIST 46 | 47 | @classmethod 48 | def _inner_function( 49 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 50 | ) -> Dataset: 51 | return UTestExtension(kwargs.get("reliability", 0.05)).calc( 52 | data, other=test_data, **kwargs 53 | ) 54 | 55 | 56 | class Chi2Test(StatHypothesisTesting): 57 | @property 58 | def search_types(self) -> list[type] | None: 59 | return [str] 60 | 61 | @classmethod 62 | def _inner_function( 63 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 64 | ) -> Dataset: 65 | return Chi2TestExtension(reliability=kwargs.get("reliability", 0.05)).calc( 66 | data, other=test_data, **kwargs 67 | ) 68 | -------------------------------------------------------------------------------- /hypex/transformers/na_filler.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Literal, Sequence 4 | 5 | from ..dataset.dataset import Dataset, ExperimentData 6 | from ..dataset.roles import FeatureRole 7 | from ..utils import ScalarType 8 | from ..utils.adapter import Adapter 9 | from .abstract import Transformer 10 | 11 | 12 | class NaFiller(Transformer): 13 | def __init__( 14 | self, 15 | target_roles: str | Sequence[str] | None = None, 16 | values: ScalarType | dict[str, ScalarType] | None = None, 17 | method: Literal["bfill", "ffill"] | None = None, 18 | key: Any = "", 19 | ): 20 | """ 21 | Initializes a NaFiller object. 22 | 23 | Args: 24 | target_roles (Optional[Union[str, Sequence[str]]], optional): The roles of the target columns. Defaults to None. 25 | key (Any, optional): The key for the NaFiller object. Defaults to "". 26 | values (Union[ScalarType, Dict[str, ScalarType]], optional): The values to fill missing values with. Defaults to None. 27 | method (Literal["bfill", "ffill"], optional): The method to fill missing values. Defaults to None. 
28 | 29 | Returns: 30 | None 31 | """ 32 | 33 | super().__init__(key=key) 34 | self.target_roles = target_roles or FeatureRole() 35 | self.values = values 36 | self.method = method 37 | 38 | @staticmethod 39 | def _inner_function( 40 | data: Dataset, 41 | target_cols: str | None = None, 42 | values: ScalarType | dict[str, ScalarType] | None = None, 43 | method: Literal["bfill", "ffill"] | None = None, 44 | ) -> Dataset: 45 | target_cols = Adapter.to_list(target_cols) 46 | for column in target_cols: 47 | value = values[column] if isinstance(values, dict) else values 48 | data[column] = data[column].fillna(values=value, method=method) 49 | return data 50 | 51 | def execute(self, data: ExperimentData) -> ExperimentData: 52 | target_cols = data.ds.search_columns(roles=self.target_roles) 53 | result = data.copy( 54 | data=self.calc( 55 | data=data.ds, 56 | target_cols=target_cols, 57 | values=self.values, 58 | method=self.method, 59 | ) 60 | ) 61 | return result 62 | -------------------------------------------------------------------------------- /hypex/homogeneity.py: -------------------------------------------------------------------------------- 1 | from .analyzers.aa import OneAAStatAnalyzer 2 | from .comparators import Chi2Test, GroupDifference, GroupSizes, KSTest, TTest 3 | from .dataset import TargetRole, TreatmentRole 4 | from .experiments.base import Experiment, OnRoleExperiment 5 | from .ui.base import ExperimentShell 6 | from .ui.homo import HomoOutput 7 | 8 | HOMOGENEITY_TEST = Experiment( 9 | executors=[ 10 | OnRoleExperiment( 11 | executors=[ 12 | GroupSizes(grouping_role=TreatmentRole(), compare_by="groups"), 13 | GroupDifference(grouping_role=TreatmentRole(), compare_by="groups"), 14 | TTest(grouping_role=TreatmentRole(), compare_by="groups"), 15 | KSTest(grouping_role=TreatmentRole(), compare_by="groups"), 16 | Chi2Test(grouping_role=TreatmentRole(), compare_by="groups"), 17 | ], 18 | role=TargetRole(), 19 | ), 20 | OneAAStatAnalyzer(), 21 | ] 22 | ) 23 | 24 | 25 | class HomogeneityTest(ExperimentShell): 26 | """A class for conducting homogeneity tests between the groups. 27 | 28 | This class provides functionality to test whether treatment and control groups are 29 | homogeneous across target variables using multiple statistical tests including t-test, 30 | Kolmogorov-Smirnov test, and chi-square test. 31 | 32 | The class runs the following analyses: 33 | - Group size comparisons 34 | - Group differences 35 | - T-test for continuous variables 36 | - KS-test for distribution comparisons 37 | - Chi-square test for categorical variables 38 | - AA statistics analysis 39 | 40 | Examples 41 | -------- 42 | .. 
code-block:: python 43 | 44 | # Basic homogeneity test 45 | homo_test = HomogeneityTest() 46 | results = homo_test.execute(data) 47 | 48 | # Accessing specific test results 49 | homo_test = HomogeneityTest() 50 | results = homo_test.execute(data) 51 | output = results.resume 52 | 53 | # Running test on dataset with roles 54 | from hypex.dataset import Dataset, TargetRole, TreatmentRole 55 | ds = Dataset( 56 | roles={ 57 | 'treatment': TreatmentRole(), 58 | 'outcome': TargetRole() 59 | }, 60 | data=df 61 | ) 62 | homo_test = HomogeneityTest() 63 | results = homo_test.execute(ds) 64 | """ 65 | 66 | def __init__(self): 67 | """Initialize HomogeneityTest with default experiment and output configurations.""" 68 | super().__init__( 69 | experiment=HOMOGENEITY_TEST, 70 | output=HomoOutput(), 71 | ) 72 | -------------------------------------------------------------------------------- /hypex/ui/ab.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from ..analyzers.ab import ABAnalyzer 4 | from ..comparators import GroupDifference, GroupSizes 5 | from ..dataset import Dataset, ExperimentData, StatisticRole, TreatmentRole 6 | from ..reporters.ab import ABDatasetReporter 7 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum 8 | from .base import Output 9 | 10 | 11 | class ABOutput(Output): 12 | multitest: Union[Dataset, str] 13 | sizes: Dataset 14 | 15 | def __init__(self): 16 | self._groups = [] 17 | super().__init__(resume_reporter=ABDatasetReporter()) 18 | 19 | def _extract_multitest_result(self, experiment_data: ExperimentData): 20 | multitest_id = experiment_data.get_one_id( 21 | ABAnalyzer, ExperimentDataEnum.analysis_tables 22 | ) 23 | if multitest_id and "MultiTest" in multitest_id: 24 | self.multitest = experiment_data.analysis_tables[multitest_id] 25 | else: 26 | self.multitest = ( 27 | "There were fewer than three groups, or no multitest method was provided" 28 | ) 29 | 30 | def _extract_differences(self, experiment_data: ExperimentData): 31 | targets = [] 32 | groups = [] 33 | ids = experiment_data.get_ids( 34 | GroupDifference, 35 | searched_space=ExperimentDataEnum.analysis_tables, 36 | )["GroupDifference"]["analysis_tables"] 37 | self._groups = list( 38 | experiment_data.groups[ 39 | experiment_data.ds.search_columns(TreatmentRole())[0] 40 | ].keys() 41 | )[1:] 42 | for i in self._groups: 43 | groups += [i] * len(ids) 44 | diff = Dataset.create_empty() 45 | for i in range(len(ids)): 46 | diff = diff.append(experiment_data.analysis_tables[ids[i]]) 47 | targets += [ids[i].split(ID_SPLIT_SYMBOL)[-1]] 48 | return diff.add_column(groups, role={"group": StatisticRole()}).add_column( 49 | targets * len(self._groups), role={"feature": StatisticRole()} 50 | ) 51 | 52 | def _extract_sizes(self, experiment_data: ExperimentData): 53 | ids = experiment_data.get_ids( 54 | GroupSizes, 55 | searched_space=ExperimentDataEnum.analysis_tables, 56 | )["GroupSizes"]["analysis_tables"] 57 | self.sizes = experiment_data.analysis_tables[ids[0]].add_column( 58 | self._groups, role={"group": StatisticRole()} 59 | ) 60 | 61 | def extract(self, experiment_data: ExperimentData): 62 | super().extract(experiment_data) 63 | self._extract_differences(experiment_data) 64 | self._extract_multitest_result(experiment_data) 65 | self._extract_sizes(experiment_data) 66 | -------------------------------------------------------------------------------- /hypex/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .constants import ( 2 | ID_SPLIT_SYMBOL, 3 | MATCHING_INDEXES_SPLITTER_SYMBOL, 4 | NAME_BORDER_SYMBOL, 5 | NUMBER_TYPES_LIST, 6 | ) 7 | from .enums import ABNTestMethodsEnum, BackendsEnum, ExperimentDataEnum, SpaceEnum 8 | from .errors import ( 9 | AbstractMethodError, 10 | BackendTypeError, 11 | ConcatBackendError, 12 | ConcatDataError, 13 | DataTypeError, 14 | MergeOnError, 15 | NoColumnsError, 16 | NoRequiredArgumentError, 17 | NotFoundInExperimentDataError, 18 | NotSuitableFieldError, 19 | RoleColumnError, 20 | SpaceError, 21 | ) 22 | from .tutorial_data_creation import ( 23 | create_test_data, 24 | gen_control_variates_df, 25 | gen_oracle_df, 26 | gen_special_medicine_df, 27 | ) 28 | from .typings import ( 29 | CategoricalTypes, 30 | DecoratedType, 31 | DefaultRoleTypes, 32 | DocstringInheritDecorator, 33 | FromDictTypes, 34 | GroupingDataType, 35 | MultiFieldKeyTypes, 36 | RoleNameType, 37 | ScalarType, 38 | SetParamsDictTypes, 39 | StratificationRoleTypes, 40 | TargetRoleTypes, 41 | ) 42 | 43 | __all__ = [ 44 | "ID_SPLIT_SYMBOL", 45 | "MATCHING_INDEXES_SPLITTER_SYMBOL", 46 | "NAME_BORDER_SYMBOL", 47 | "NUMBER_TYPES_LIST", 48 | "ABNTestMethodsEnum", 49 | "AbstractMethodError", 50 | "BackendTypeError", 51 | "BackendsEnum", 52 | "CategoricalTypes", 53 | "ConcatBackendError", 54 | "ConcatDataError", 55 | "DataTypeError", 56 | "DecoratedType", 57 | "DefaultRoleTypes", 58 | "DocstringInheritDecorator", 59 | "ExperimentDataEnum", 60 | "FromDictTypes", 61 | "GroupingDataType", 62 | "MergeOnError", 63 | "MultiFieldKeyTypes", 64 | "NoColumnsError", 65 | "NoRequiredArgumentError", 66 | "NotFoundInExperimentDataError", 67 | "NotSuitableFieldError", 68 | "RoleColumnError", 69 | "RoleNameType", 70 | "ScalarType", 71 | "SetParamsDictTypes", 72 | "SpaceEnum", 73 | "SpaceError", 74 | "StratificationRoleTypes", 75 | "TargetRoleTypes", 76 | "create_test_data", 77 | "gen_control_variates_df", 78 | "gen_oracle_df", 79 | "gen_special_medicine_df", 80 | ] 81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "HypEx" 3 | version = "1.0.2" 4 | description = "Fast and customizable framework for Causal Inference" 5 | authors = [ 6 | "Dmitry Tikhomirov ", 7 | "Dmitry Bulychev ", 8 | "Ivan Yurashku ", 9 | "Anton Katkov ", 10 | "Ruslan Alsherov ", 11 | "Ksenia Vasilieva ", 12 | "Anastasiia Fedorova " 13 | ] 14 | readme = "README.md" 15 | license = "Apache-2.0" 16 | repository = "https://github.com/sb-ai-lab/HypEx" 17 | classifiers = [ 18 | "Programming Language :: Python :: 3 :: Only", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "Operating System :: OS Independent", 25 | "Intended Audience :: Science/Research", 26 | "Development Status :: 4 - Beta", 27 | "Environment :: Console", 28 | "Natural Language :: English", 29 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 30 | "Topic :: Scientific/Engineering :: Mathematics", 31 | "Typing :: Typed", 32 | ] 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.8, <3.13" 36 | tqdm = "*" 37 | scikit-learn = "*" 38 | 39 | pandas = [ 40 | { version = ">=1.3.5, <=2.0.3", python = "<3.9" }, 41 | { version = ">=1.3.5, 
<=2.2.3", python = ">=3.9" } 42 | ] 43 | 44 | numpy = [ 45 | { version = ">=1.17.0, <=1.24.4", python = "<3.9" }, 46 | { version = ">=1.17.0, <=1.26.4", python = ">=3.9" } 47 | ] 48 | 49 | scipy = [ 50 | { version = ">=1.5.0, <=1.10.1", python = "<3.9" }, 51 | { version = ">=1.5.0, <=1.13.1", python = ">=3.9" } 52 | ] 53 | 54 | matplotlib = [ 55 | { version = ">=3.0.0, <=3.7.3", python = "<3.9" }, 56 | { version = ">=3.0.0, <=3.9.0", python = ">=3.9" } 57 | ] 58 | 59 | faiss-cpu = ">=1.6.0, <=1.8.0" 60 | seaborn = "<=0.13.2" 61 | statsmodels = "<=0.14.2" 62 | 63 | [tool.poetry.extras] 64 | cat = ["catboost"] 65 | lgbm = ["lightgbm"] 66 | all = ["catboost", "lightgbm"] 67 | 68 | [tool.poetry.group.dev.dependencies] 69 | docutils = ">=0.17,<0.21" 70 | jupyter = "^1.0.0" 71 | pytest = "^7.4.3" 72 | sphinx = { version = "^7.2.6", python = ">=3.9, <3.11" } 73 | nbsphinx = "*" 74 | nbsphinx_link = "*" 75 | sphinx_autodoc-typehints = "*" 76 | sphinx_rtd_theme = "^1.2.2" 77 | ruff = "*" 78 | alive-progress = "^3.1.0" 79 | psutil = "^7.0.0" 80 | jsonschema = "^4.23.0" 81 | 82 | 83 | [build-system] 84 | requires = ["poetry-core"] 85 | build-backend = "poetry.core.masonry.api" 86 | 87 | [tool.ruff] 88 | line-length = 88 89 | target-version = "py38" 90 | 91 | [tool.ruff.lint] 92 | select = ["E", "F", "W", "C90", "UP", "RUF", "I"] 93 | ignore = ["E501", "RUF003", "C901"] 94 | -------------------------------------------------------------------------------- /hypex/extensions/faiss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | import faiss  # type: ignore 6 | import numpy as np 7 | import pandas as pd  # type: ignore 8 | 9 | from ..dataset import AdditionalMatchingRole, Dataset 10 | from .abstract import MLExtension 11 | 12 | 13 | class FaissExtension(MLExtension): 14 | def __init__( 15 | self, n_neighbors: int = 1, faiss_mode: Literal["base", "fast", "auto"] = "auto" 16 | ): 17 | self.n_neighbors = n_neighbors 18 | self.faiss_mode = faiss_mode 19 | super().__init__() 20 | 21 | @staticmethod 22 | def _prepare_indexes(index: np.ndarray, dist: np.ndarray, k: int): 23 | new = [ 24 | np.concatenate( 25 | [val[np.where(dist[i] == d)[0]] for d in sorted(set(dist[i]))[:k]] 26 | ) 27 | for i, val in enumerate(index) 28 | ] 29 | return new 30 | 31 | def _predict(self, data: Dataset, test_data: Dataset, X: np.ndarray) -> pd.Series: 32 | dist, indexes = self.index.search(X, k=self.n_neighbors) 33 | if self.n_neighbors == 1: 34 | equal_dist = list(map(lambda x: np.where(x == x[0])[0], dist)) 35 | indexes = [ 36 | ( 37 | int(index[dist][0]) 38 | if abs(index[dist][0]) <= len(data) + len(test_data) 39 | else -1 40 | ) 41 | for index, dist in zip(indexes, equal_dist) 42 | ] 43 | else: 44 | indexes = self._prepare_indexes(indexes, dist, self.n_neighbors) 45 | return pd.Series(indexes) 46 | 47 | def _calc_pandas( 48 | self, 49 | data: Dataset, 50 | test_data: Dataset | None = None, 51 | mode: Literal["auto", "fit", "predict"] | None = None, 52 | **kwargs, 53 | ): 54 | mode = mode or "auto" 55 | X = data.data.values 56 | test = test_data.data.values if test_data is not None else np.empty((0, X.shape[1])) 57 | if mode in ["auto", "fit"]: 58 | self.index = faiss.IndexFlatL2(X.shape[1]) 59 | if (( 60 | len(X) > 1_000_000 and self.faiss_mode == "auto" 61 | ) or self.faiss_mode == "fast" 62 | ) and len(X) > 1_000 and len(test) > 1_000: 63 | self.index = faiss.IndexIVFFlat(self.index, X.shape[1], 1000) 64 | self.index.train(X) 65 | self.index.add(X) 66 | if mode in 
["auto", "predict"]: 67 | if test_data is None: 68 | raise ValueError("test_data is needed for evaluation") 69 | X = test_data.data.values if mode == "auto" else data.data.values 70 | return self._predict(data, test_data, X) 71 | return self 72 | 73 | def fit(self, X: Dataset, Y: Dataset | None = None, **kwargs): 74 | return super().calc(X, target_data=Y, mode="fit", **kwargs) 75 | 76 | def predict(self, X: Dataset, **kwargs) -> Dataset: 77 | return self.result_to_dataset( 78 | super().calc(X, mode="predict", **kwargs), AdditionalMatchingRole() 79 | ) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | *.csv 3 | *.png 4 | *.pickle 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # DS_store 15 | .DS_Store 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | tabularAutoML_model_report/ 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | docs/_autosummary/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # VSCode 134 | .vscode 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | # pytype static type analyzers 148 | .pytype/ 149 | 150 | # Cython debug symbols 151 | cython_debug/ 152 | 153 | # VSCode 154 | .vscode/ 155 | 156 | .idea/ 157 | lama_venv/ 158 | *.db 159 | 160 | temp/ 161 | 162 | poetry.lock 163 | -------------------------------------------------------------------------------- /hypex/factory/base.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | # 3 | # from ..analyzers import ABAnalyzer, OneAAStatAnalyzer 4 | # from ..comparators import GroupDifference, GroupSizes, ATE, TTest, KSTest, UTest 5 | # from ..dataset import ( 6 | # ExperimentData, 7 | # Arg1Role, 8 | # Arg2Role, 9 | # InfoRole, 10 | # TargetRole, 11 | # FeatureRole, 12 | # GroupingRole, 13 | # PreTargetRole, 14 | # StatisticRole, 15 | # StratificationRole, 16 | # TreatmentRole, 17 | # TempTreatmentRole, 18 | # TempGroupingRole, 19 | # TempTargetRole, 20 | # ) 21 | # from ..experiments import ( 22 | # Experiment, 23 | # OnRoleExperiment, 24 | # GroupExperiment, 25 | # CycledExperiment, 26 | # ) 27 | # from ..reporters import OneAADictReporter 28 | # from ..transformers import Shuffle 29 | # from ..utils import ExperimentDataEnum, SpaceEnum 30 | # 31 | # all_classes = [ 32 | # ABAnalyzer, 33 | # OneAAStatAnalyzer, 34 | # GroupDifference, 35 | # GroupSizes, 36 | # ATE, 37 | # TTest, 38 | # KSTest, 39 | # UTest, 40 | # Arg1Role, 41 | # Arg2Role, 42 | # InfoRole, 43 | # TargetRole, 44 | # FeatureRole, 45 | # GroupingRole, 46 | # PreTargetRole, 47 | # StatisticRole, 48 | # StratificationRole, 49 | # TreatmentRole, 50 | # TempTreatmentRole, 51 | # TempGroupingRole, 52 | # TempTargetRole, 53 | # OnRoleExperiment, 54 | # GroupExperiment, 55 | # CycledExperiment, 56 | # OneAADictReporter, 57 | # Shuffle, 58 | # ExperimentDataEnum, 59 | # SpaceEnum, 60 | # ] 61 | # 62 | # spaces = { 63 | # "additional": SpaceEnum.additional, 64 | # "auto": SpaceEnum.auto, 65 | # "data": SpaceEnum.data, 66 | # } 67 | # 68 | # 69 | # class Factory: 70 | # def __init__(self, hypothesis): 71 | # self.hypothesis = hypothesis 72 | # 73 | # def make_experiment(self, experiment): 74 | # executors = [] 75 | # for key, items in experiment.items(): 76 | # class_ = getattr(sys.modules[__name__], key) 77 | # if "executors" in items or "inner_executors" in items: 78 | # item = "executors" if "executors" in items else "inner_executors" 79 | # items[f"{item}"] = self.make_experiment(experiment[key][f"{item}"][0]) 80 | # if "role" in items or "grouping_role" in items: 81 | # item = "role" if "role" in items else "grouping_role" 82 | # items[f"{item}"] = getattr( 83 | # sys.modules[__name__], items[item] + "Role" 84 | # )() 85 | # if "space" in items: 86 | # items["space"] = spaces.get(items["space"]) 87 | # items = {i: None if j == "None" else j for i, j in items.items()} 88 | # executors.append(class_(**items)) 89 | # return executors 90 | # 91 | # def 
execute(self): 92 | #         experiment_data = ExperimentData(self.hypothesis.dataset) 93 | #         experiment = Experiment( 94 | #             executors=self.make_experiment(self.hypothesis.experiment) 95 | #         ) 96 | #         return experiment_data, experiment 97 | -------------------------------------------------------------------------------- /hypex/utils/errors.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | 4 | class RoleColumnError(Exception): 5 | def __init__(self, roles, columns): 6 | super().__init__( 7 | "Check your roles. All of them must be names of data columns. \n" 8 | f"Now roles have {roles} values and columns have {columns} values" 9 | ) 10 | 11 | 12 | class ConcatDataError(Exception): 13 | def __init__(self, data_type): 14 | super().__init__(f"Can only append Dataset to Dataset. Got {data_type}") 15 | 16 | 17 | class ConcatBackendError(Exception): 18 | def __init__(self, other_backend, backend): 19 | super().__init__( 20 | f"Can only append data with the same backends. Got {other_backend} expected {backend}" 21 | ) 22 | 23 | 24 | class SpaceError(Exception): 25 | def __init__(self, space): 26 | super().__init__(f"{space} is not a valid space") 27 | 28 | 29 | class NoColumnsError(Exception): 30 | def __init__(self, role): 31 | super().__init__(f"No columns found by role {role}") 32 | 33 | 34 | class NotSuitableFieldError(Exception): 35 | def __init__(self, field, field_role: Literal["Grouping", "Target", "Baseline"]): 36 | super().__init__(f"{field_role} field {field} is not suitable for comparison") 37 | 38 | 39 | class NotFoundInExperimentDataError(Exception): 40 | def __init__(self, class_: str): 41 | super().__init__(f"{class_} id is not found in ExperimentData") 42 | 43 | 44 | class AbstractMethodError(NotImplementedError): 45 | def __init__(self): 46 | super().__init__( 47 | "This method is abstract and must be overridden in a derived class." 48 | ) 49 | 50 | 51 | class DataTypeError(Exception): 52 | def __init__(self, data_type): 53 | super().__init__( 54 | f"Can only perform the operation between two Datasets. Got {data_type}" 55 | ) 56 | 57 | 58 | class BackendTypeError(Exception): 59 | def __init__(self, other_backend, backend): 60 | super().__init__( 61 | f"Can only perform the operation with the same backends. Got {other_backend} expected {backend}" 62 | ) 63 | 64 | 65 | class MergeOnError(Exception): 66 | def __init__(self, on): 67 | super().__init__(f"Can only merge on one of the data columns. Got {on}") 68 | 69 | 70 | class NoRequiredArgumentError(Exception): 71 | def __init__(self, argument_name): 72 | super().__init__(f"The required argument {argument_name} has not been passed.") 73 | 74 | 75 | class NoneArgumentError(Exception): 76 | def __init__(self, arg, process): 77 | super().__init__(f"Argument {arg} is None in process {process}.") 78 | 79 | 80 | class InvalidArgumentError(Exception): 81 | def __init__(self, arg, possible_type): 82 | super().__init__( 83 | f"Invalid type for argument {arg}, the possible type is {possible_type}." 84 | ) 85 | 86 | 87 | class PairsNotFoundError(Exception): 88 | def __init__(self): 89 | super().__init__( 90 | "Pairs were not found. Check your input data and try executing the preprocessing pipeline before matching estimation." 
91 | ) 92 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Improved colors and styles */ 2 | .wy-nav-content { 3 | max-width: 1200px; 4 | } 5 | 6 | /* Attractive headings */ 7 | h1 { 8 | color: #2980B9; 9 | border-bottom: 2px solid #2980B9; 10 | padding-bottom: 10px; 11 | } 12 | 13 | h2 { 14 | color: #34495e; 15 | margin-top: 30px; 16 | } 17 | 18 | /* Code block styling */ 19 | .highlight { 20 | background: #f8f8f8 !important; 21 | border: 1px solid #e1e4e8; 22 | border-radius: 6px; 23 | margin: 1em 0; 24 | } 25 | 26 | /* Attractive admonitions */ 27 | .admonition { 28 | border-radius: 6px; 29 | padding: 12px; 30 | border-left: 4px solid; 31 | } 32 | 33 | .admonition.note { 34 | background-color: #e3f2fd; 35 | border-left-color: #2196f3; 36 | } 37 | 38 | .admonition.warning { 39 | background-color: #fff3cd; 40 | border-left-color: #ffc107; 41 | } 42 | 43 | .admonition.tip { 44 | background-color: #e8f5e9; 45 | border-left-color: #4caf50; 46 | } 47 | 48 | /* Improved tables */ 49 | .wy-table-responsive table td, .wy-table-responsive table th { 50 | white-space: normal; 51 | } 52 | 53 | table.docutils { 54 | border: 1px solid #e1e4e8; 55 | border-collapse: collapse; 56 | border-spacing: 0; 57 | width: 100%; 58 | } 59 | 60 | table.docutils tr:nth-child(2n) { 61 | background-color: #f6f8fa; 62 | } 63 | 64 | /* Attractive buttons */ 65 | .btn { 66 | display: inline-block; 67 | padding: 6px 12px; 68 | margin-bottom: 0; 69 | font-size: 14px; 70 | font-weight: 400; 71 | line-height: 1.42857143; 72 | text-align: center; 73 | white-space: nowrap; 74 | vertical-align: middle; 75 | cursor: pointer; 76 | border: 1px solid transparent; 77 | border-radius: 4px; 78 | background-color: #2980B9; 79 | color: white; 80 | text-decoration: none; 81 | } 82 | 83 | .btn:hover { 84 | background-color: #21618C; 85 | color: white; 86 | } 87 | 88 | /* API Reference styling */ 89 | .class dt { 90 | font-size: 1.1em; 91 | font-weight: bold; 92 | margin-top: 20px; 93 | padding: 10px; 94 | background-color: #f0f0f0; 95 | border-left: 3px solid #2980B9; 96 | } 97 | 98 | /* Parameters styling */ 99 | .field-list { 100 | margin: 20px 0; 101 | } 102 | 103 | .field-name { 104 | font-weight: bold; 105 | min-width: 100px; 106 | padding-right: 20px; 107 | } 108 | 109 | /* Navigation */ 110 | .wy-menu-vertical li.current > a { 111 | background: #fcfcfc; 112 | border-right: solid 3px #2980B9; 113 | } 114 | 115 | .wy-menu-vertical li.current a { 116 | color: #404040; 117 | } 118 | 119 | /* Badges */ 120 | .badge { 121 | display: inline-block; 122 | padding: 3px 7px; 123 | font-size: 12px; 124 | font-weight: bold; 125 | line-height: 1; 126 | color: #fff; 127 | text-align: center; 128 | white-space: nowrap; 129 | vertical-align: baseline; 130 | border-radius: 10px; 131 | } 132 | 133 | .badge-primary { 134 | background-color: #2980B9; 135 | } 136 | 137 | .badge-success { 138 | background-color: #27ae60; 139 | } 140 | 141 | .badge-warning { 142 | background-color: #f39c12; 143 | } -------------------------------------------------------------------------------- /hypex/experiments/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any, Iterable, Sequence 5 | 6 | from ..dataset import ABCRole, ExperimentData, TempTargetRole 7 | from ..executor import Executor 8 | 
from ..utils import ExperimentDataEnum 9 | 10 | 11 | class Experiment(Executor): 12 | def _detect_transformer(self) -> bool: 13 | return all(executor._is_transformer for executor in self.executors) 14 | 15 | def get_executor_ids( 16 | self, searched_classes: type | Iterable[type] | None = None 17 | ) -> dict[type, list[str]]: 18 | if not searched_classes: 19 | return {} 20 | 21 | searched_classes = ( 22 | searched_classes 23 | if isinstance(searched_classes, Iterable) 24 | else [searched_classes] 25 | ) 26 | return { 27 | searched_class: [ 28 | executor.id 29 | for executor in self.executors 30 | if isinstance(executor, searched_class) 31 | ] 32 | for searched_class in searched_classes 33 | } 34 | 35 | def __init__( 36 | self, 37 | executors: Sequence[Executor], 38 | transformer: bool | None = None, 39 | key: Any = "", 40 | ): 41 | self.executors: Sequence[Executor] = executors 42 | self.transformer: bool = ( 43 | transformer if transformer is not None else self._detect_transformer() 44 | ) 45 | super().__init__(key) 46 | 47 | def set_params(self, params: dict[str, Any] | dict[type, dict[str, Any]]) -> None: 48 | if isinstance(next(iter(params)), str): 49 | super().set_params(params) 50 | elif isinstance(next(iter(params)), type): 51 | for executor in self.executors: 52 | executor.set_params(params) 53 | else: 54 | raise ValueError( 55 | "params must be a dict of str to dict or a dict of class to dict" 56 | ) 57 | 58 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 59 | return data.set_value(ExperimentDataEnum.analysis_tables, self.id, value) 60 | 61 | def execute(self, data: ExperimentData) -> ExperimentData: 62 | experiment_data = deepcopy(data) if self.transformer else data 63 | for executor in self.executors: 64 | executor.key = self.key 65 | experiment_data = executor.execute(experiment_data) 66 | return experiment_data 67 | 68 | 69 | class OnRoleExperiment(Experiment): 70 | def __init__( 71 | self, 72 | executors: list[Executor], 73 | role: ABCRole | Sequence[ABCRole], 74 | transformer: bool | None = None, 75 | key: Any = "", 76 | ): 77 | self.role: list[ABCRole] = [role] if isinstance(role, ABCRole) else list(role) 78 | super().__init__(executors, transformer, key) 79 | 80 | def execute(self, data: ExperimentData) -> ExperimentData: 81 | for field in data.ds.search_columns(self.role): 82 | data.ds.tmp_roles = {field: TempTargetRole()} 83 | data = super().execute(data) 84 | data.ds.tmp_roles = {} 85 | return data 86 | -------------------------------------------------------------------------------- /hypex/hypotheses/hypothesis.py: -------------------------------------------------------------------------------- 1 | # import json 2 | # from typing import Optional, Union, Dict, Any 3 | # 4 | # from jsonschema import validate # type: ignore 5 | # 6 | # from hypex.dataset import Dataset, default_roles, InfoRole 7 | # from hypex.factory.base import Factory 8 | # 9 | # 10 | # class Hypothesis: 11 | # def __init__(self, config: Union[str, Dict[str, Any]]): 12 | # if isinstance(config, str): 13 | # with open(config, "rb") as file: 14 | # opened_config = json.load(file) 15 | # else: 16 | # opened_config = config 17 | # with open("hypex\\hypotheses\\schemes\\scheme.json", "rb") as file: 18 | # self.scheme = json.load(file) 19 | # self.config = opened_config 20 | # self.dataset = self.config.get("dataset") 21 | # self.experiment = self.config.get("experiment") 22 | # self.report = self.config.get("report") 23 | # self.validate_config() 24 | # self._parse_config() 
25 | # 26 | # def validate_config(self): 27 | # validate(self.config, self.scheme) 28 | # if ( 29 | # "data" in self.dataset.keys() 30 | # and "path" not in self.dataset.keys() 31 | # and not self.dataset["data"]["data"] 32 | # ): 33 | # raise ValueError("Data or path to data must be added") 34 | # # if len(self.dataset["roles"]["role_names"]) != len( 35 | # # self.dataset["roles"]["columns"] 36 | # # ): 37 | # # raise ValueError( 38 | # # f"Invalid number of columns and role_names. Columns and role_names must have equal length.\n " 39 | # # f"role_names contains {len(self.dataset['roles']['role_names'])} values and columns contains {len(self.dataset['roles']['columns'])}" 40 | # # ) 41 | # 42 | # def _parse_config(self): 43 | # self.dataset = self._parse_dataset() 44 | # 45 | # def _parse_dataset(self): 46 | # data = ( 47 | # self.dataset["data"] 48 | # if "data" in self.dataset.keys() 49 | # else self.dataset["path"] 50 | # ) 51 | # roles = {} 52 | # for column in self.dataset["columns"]: 53 | # role = default_roles.get(column["role"].lower(), InfoRole()) 54 | # role.data_type = column["dataType"] if column.get("dataType") else None 55 | # roles.update({column["name"]: role}) 56 | # return Dataset(data=data, roles=roles, backend=self.dataset["backend"]) 57 | # 58 | # def to_json(self, file: Optional[str] = None): 59 | # # return json.dumps(self.dataset.to_json(), self.experiment.to_json(), self.report.to_json()) 60 | # if file: 61 | # with open(file, "w") as f: 62 | # json.dump( 63 | # {"dataset": self.dataset.to_dict(), "experiment": {}, "report": {}}, 64 | # f, 65 | # indent=4, 66 | # ) 67 | # return json.dumps( 68 | # {"dataset": self.dataset.to_dict(), "experiment": {}, "report": {}} 69 | # ) 70 | # 71 | # def execute(self): 72 | # experiment_data, self.experiment = Factory(self).execute() 73 | # return experiment_data, self.experiment 74 | -------------------------------------------------------------------------------- /hypex/reporters/matching.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, ClassVar 4 | 5 | from ..analyzers.matching import MatchingAnalyzer 6 | from ..comparators import KSTest, TTest 7 | from ..dataset import Dataset, ExperimentData 8 | from ..ml import FaissNearestNeighbors 9 | from ..reporters.abstract import DatasetReporter, DictReporter, TestDictReporter 10 | from ..utils import ( 11 | ID_SPLIT_SYMBOL, 12 | MATCHING_INDEXES_SPLITTER_SYMBOL, 13 | ExperimentDataEnum, 14 | ) 15 | 16 | 17 | class MatchingDictReporter(DictReporter): 18 | def __init__(self, searching_class: type = MatchingAnalyzer): 19 | self.searching_class = searching_class 20 | super().__init__() 21 | 22 | @staticmethod 23 | def _convert_dataset_to_dict(data: Dataset) -> dict[str, Any]: 24 | dict_data = data.to_dict()["data"] 25 | indexes = dict_data["index"] 26 | df = dict_data["data"] 27 | result = {} 28 | for key, values in df.items(): 29 | for index, value in zip(indexes, values): 30 | result[f"{key}{ID_SPLIT_SYMBOL}{index}"] = value 31 | return result 32 | 33 | def _extract_from_analyser(self, data: ExperimentData): 34 | analyzer_id = data.get_one_id( 35 | self.searching_class, ExperimentDataEnum.analysis_tables 36 | ) 37 | return self._convert_dataset_to_dict(data.analysis_tables[analyzer_id]) 38 | 39 | @staticmethod 40 | def _extract_from_additional_fields(data: ExperimentData): 41 | indexes_id = data.get_one_id( 42 | FaissNearestNeighbors, ExperimentDataEnum.additional_fields 43 | ) 
44 | return { 45 | "indexes": MATCHING_INDEXES_SPLITTER_SYMBOL.join( 46 | str(i) 47 | for i in data.additional_fields[indexes_id].to_dict()["data"]["data"][ 48 | indexes_id 49 | ] 50 | ) 51 | } 52 | 53 | def report(self, experiment_data: ExperimentData): 54 | result = {} 55 | result.update(self._extract_from_analyser(experiment_data)) 56 | if self.searching_class == MatchingAnalyzer: 57 | result.update(self._extract_from_additional_fields(experiment_data)) 58 | return result 59 | 60 | 61 | class MatchingQualityDictReporter(TestDictReporter): 62 | tests: ClassVar[list] = [TTest, KSTest] 63 | 64 | def report(self, data: ExperimentData) -> dict[str, Any]: 65 | return self.extract_tests(data) 66 | 67 | 68 | class MatchingQualityDatasetReporter(MatchingQualityDictReporter): 69 | @classmethod 70 | def convert_flat_dataset(cls, data: dict) -> Dataset: 71 | struct_dict = cls._get_struct_dict(data) 72 | return cls._convert_struct_dict_to_dataset(struct_dict) 73 | 74 | def report(self, data: ExperimentData): 75 | front_buffer = self.front 76 | self.front = False 77 | dict_report = super().report(data) 78 | self.front = front_buffer 79 | return self.convert_flat_dataset(dict_report) 80 | 81 | 82 | class MatchingDatasetReporter(DatasetReporter): 83 | def __init__(self, searching_class: type = MatchingAnalyzer) -> None: 84 | self.dict_reporter = MatchingDictReporter(searching_class) 85 | super().__init__(self.dict_reporter) 86 | -------------------------------------------------------------------------------- /hypex/utils/decorator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import wraps 4 | from typing import Any, Callable, cast 5 | 6 | from hypex.utils import DecoratedType, DocstringInheritDecorator 7 | 8 | 9 | def inherit_docstring_from( 10 | source: Callable[..., Any] | property, 11 | ) -> DocstringInheritDecorator: 12 | """A decorator to inherit the docstring from another function or property. 13 | 14 | This decorator can be applied to both callable objects and properties. It copies the docstring 15 | from the source object to the decorated object if the latter does not already have a docstring. 16 | 17 | Args: 18 | source: The object from which the docstring will be inherited. 19 | This should be either a callable or a property that has 20 | a well-defined __doc__ attribute. 21 | 22 | Returns: 23 | A decorator that when applied to a function or property, 24 | sets its __doc__ attribute to that of the source. 25 | 26 | Raises: 27 | TypeError: If the object to be decorated is neither a callable nor a property. 28 | 29 | Example: 30 | Using with property:: 31 | 32 | class SomeClass: 33 | @property 34 | @inherit_docstring_from(pd.DataFrame.iloc) 35 | def iloc(self): 36 | return self._data.iloc 37 | 38 | Using with method:: 39 | 40 | @inherit_docstring_from(pd.DataFrame.mean) 41 | def mean(self): 42 | return self._data.mean() 43 | """ 44 | 45 | def decorator(obj: DecoratedType) -> DecoratedType: 46 | """ 47 | Apply the inherited docstring to a given function or property. 48 | This function acts as a decorator within 'inherit_docstring_from', applying the docstring 49 | from the 'source' object to the 'obj'. If 'obj' is a property, it modifies the property to include 50 | the source's docstring. If 'obj' is a callable, it wraps the callable in a function that preserves 51 | the original callable's functionality and metadata but updates the docstring. 
52 | Args: 53 | obj (DecoratedType): The function or property to which the docstring will be applied. 54 | It must be either a callable or a property. 55 | Returns: 56 | DecoratedType: The original object with the updated docstring. If the object is a property, 57 | it returns a new property object with the inherited docstring. If it's a callable, 58 | it returns the wrapped callable with the updated docstring. 59 | Raises: 60 | TypeError: If 'obj' is neither a callable nor a property. 61 | """ 62 | if isinstance(obj, property): 63 | doc = getattr(source, "__doc__", "No documentation provided.") 64 | return property(obj.fget, obj.fset, obj.fdel, doc) 65 | elif callable(obj): 66 | 67 | @wraps(obj) 68 | def wrapper(*args, **kwargs) -> Any: 69 | return obj(*args, **kwargs) 70 | 71 | wrapper.__doc__ = getattr(source, "__doc__", "No documentation provided.") 72 | return cast(DecoratedType, wrapper) 73 | else: 74 | raise TypeError( 75 | "The decorator can only be applied to callables or properties." 76 | ) 77 | 78 | return decorator 79 | -------------------------------------------------------------------------------- /schemes/architecture_levels.md: -------------------------------------------------------------------------------- 1 | # HypEx abstraction levels 2 | Splitting the project into abstraction levels makes it easier for different categories of users to work with it. 3 | 4 | ## Level 1. User interface on the platform. 5 | **User** - a business user who wants to get data analysis results. 6 | 7 | **Segment** - platform 8 | 9 | **Idea** - let the user work with the project without special knowledge of statistics or writing code. Ideally, the user simply picks a scenario for their project, developed by financiers, and runs it. 10 | 11 | **Usage** - running a scenario in the interface on the platform. 12 | 13 | ## Level 2. Building scenarios in the constructor on the platform. 14 | **User** - a financier who wants to build a scenario that solves a business task. 15 | 16 | **Segment** - platform 17 | 18 | **Idea** - give a user with statistical knowledge and an understanding of business processes the ability to build scenarios in a convenient constructor without writing code. 19 | 20 | **Usage** - building scenarios in the constructor on the platform via a graphical interface. 21 | 22 | ## Level 3. A scenario configurable in template code. 23 | **User** - a business user with access to data and to a lab zone for running the scenario. 24 | 25 | **Segment** - Dev package 26 | 27 | **Idea** - let the user run a pre-programmed scenario with configurable parameters. 28 | 29 | **Usage** - running template code with configurable parameters in the lab zone. 30 | 31 | ## Level 4. Using the HypEx experiment shell. 32 | **User** - a data researcher who has studied the HypEx tutorial or documentation 33 | 34 | **Segment** - library 35 | 36 | **Idea** - let the user configure and run basic experiments in a couple of lines of code. 37 | 38 | **Usage** - configuring and running an experiment in a couple of lines of code using the HypEx experiment shell. 39 | 40 | 41 | ## Level 5. Building an experiment in code. 42 | **User** - a developer familiar with the basic building blocks of the HypEx library (Executor). 
43 | 44 | **Segment** - library 45 | 46 | **Idea** - let the user build custom experiments in code from the basic building blocks of the HypEx library. 47 | 48 | **Usage** - building an experiment in code using the basic building blocks of the HypEx library. 49 | 50 | ## Level 6. Creating a basic block (Executor) by inheriting from a standard block. 51 | **User** - a developer familiar with the standard blocks of the HypEx library. 52 | **Segment** - library 53 | **Idea** - let the user create custom basic blocks (Executor) by inheriting from the standard blocks of the HypEx library. 54 | **Usage** - creating a custom basic block (Executor) by inheriting from a standard block of the HypEx library. 55 | 56 | ## Level 7. Modifying the library. 57 | **User** - a developer who knows well how the HypEx library is structured and how it works. 58 | 59 | **Segment** - library 60 | 61 | **Idea** - deep reworking of the basic mechanics of the HypEx library. 62 | 63 | **Usage** - deep reworking of the basic mechanics of the HypEx library. 64 | 65 | ## Level 8. Core 66 | **User** - the HypEx architect 67 | 68 | **Segment** - library 69 | 70 | **Idea** - changing the fundamental behavior of the HypEx library. This usually means a new generation of the architecture. 71 | 72 | **Usage** - building the core of the HypEx library -------------------------------------------------------------------------------- /hypex/operators/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import abstractmethod 4 | from typing import Any, Sequence 5 | 6 | from ..dataset import ( 7 | ABCRole, 8 | AdditionalTargetRole, 9 | Dataset, 10 | ExperimentData, 11 | GroupingRole, 12 | TargetRole, 13 | ) 14 | from ..executor import Calculator 15 | from ..utils import AbstractMethodError, ExperimentDataEnum, NotSuitableFieldError 16 | from ..utils.adapter import Adapter 17 | 18 | 19 | class GroupOperator( 20 | Calculator 21 | ):  # TODO: change the base class from Calculator to Comparator 22 | def __init__( 23 | self, 24 | grouping_role: ABCRole | None = None, 25 | target_roles: ABCRole | list[ABCRole] | None = None, 26 | key: Any = "", 27 | ): 28 | super().__init__(key=key) 29 | self.target_roles = target_roles or TargetRole() 30 | self.grouping_role = grouping_role or GroupingRole() 31 | 32 | @property 33 | def search_types(self): 34 | return None 35 | 36 | @classmethod 37 | @abstractmethod 38 | def _inner_function( 39 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 40 | ) -> Any: 41 | raise AbstractMethodError 42 | 43 | def _get_fields(self, data: ExperimentData): 44 | group_field = data.field_search(self.grouping_role) 45 | target_fields = data.field_search( 46 | self.target_roles, search_types=self.search_types 47 | ) 48 | if len(target_fields) != 2: 49 | target_fields += data.field_search( 50 | AdditionalTargetRole(), search_types=self.search_types 51 | ) 52 | return group_field, target_fields 53 | 54 | @classmethod 55 | def _execute_inner_function( 56 | cls, 57 | grouping_data, 58 | target_fields: list[str] | None = None, 59 | **kwargs, 60 | ) -> dict: 61 | if target_fields is None or len(target_fields) != 2: 62 | raise ValueError( 63 | f"This operator works with 2 targets, but got {len(target_fields) if target_fields else None}" 64 | ) 65 | result = {} 66 | for group, group_data in grouping_data: 67 | result[group[0]] = cls._inner_function( 68 | 
data=group_data[target_fields[0]], 69 | test_data=group_data[target_fields[1]], 70 | **kwargs, 71 | ) 72 | return result 73 | 74 | @classmethod 75 | def calc( 76 | cls, 77 | data: Dataset, 78 | group_field: Sequence[str] | str | None = None, 79 | grouping_data: list[tuple[str, Dataset]] | None = None, 80 | target_fields: str | list[str] | None = None, 81 | **kwargs, 82 | ) -> dict: 83 | group_field = Adapter.to_list(group_field) 84 | 85 | if grouping_data is None: 86 | grouping_data = data.groupby(group_field) 87 | if len(grouping_data) > 1: 88 | grouping_data[0][1].tmp_roles = data.tmp_roles 89 | else: 90 | raise NotSuitableFieldError(group_field, "Grouping") 91 | return cls._execute_inner_function( 92 | grouping_data, target_fields=target_fields, old_data=data, **kwargs 93 | ) 94 | 95 | def _set_value( 96 | self, data: ExperimentData, value: dict | None = None, key: Any = None 97 | ) -> ExperimentData: 98 | data.set_value( 99 | ExperimentDataEnum.variables, 100 | self.id, 101 | value, 102 | ) 103 | return data 104 | -------------------------------------------------------------------------------- /examples/experiments/performance_test/config.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Parameters Configuration", 4 | "description": "Schema for onefactor, montecarlo and fixed parameters", 5 | "type": "object", 6 | "properties": { 7 | "onefactor_params": { 8 | "type": "object", 9 | "properties": { 10 | "n_rows": { 11 | "type": "array", 12 | "items": { 13 | "type": "integer", 14 | "minimum": 1 15 | } 16 | }, 17 | "n_columns": { 18 | "type": "array", 19 | "items": { 20 | "type": "integer", 21 | "minimum": 1 22 | } 23 | }, 24 | "n_iterations": { 25 | "type": "array", 26 | "items": { 27 | "type": "integer", 28 | "minimum": 1 29 | } 30 | } 31 | }, 32 | "additionalProperties": true 33 | }, 34 | "montecarlo_params": { 35 | "type": "object", 36 | "properties": { 37 | "num_points": { 38 | "type": "integer", 39 | "minimum": 1 40 | }, 41 | "bounds": { 42 | "type": "object", 43 | "properties": { 44 | "n_rows": { 45 | "type": "object", 46 | "properties": { 47 | "max": { 48 | "type": "integer", 49 | "minimum": 1 50 | }, 51 | "min": { 52 | "type": "integer", 53 | "minimum": 1 54 | } 55 | }, 56 | "required": ["max", "min"], 57 | "additionalProperties": false 58 | }, 59 | "n_iterations": { 60 | "type": "object", 61 | "properties": { 62 | "max": { 63 | "type": "integer", 64 | "minimum": 1 65 | }, 66 | "min": { 67 | "type": "integer", 68 | "minimum": 1 69 | } 70 | }, 71 | "required": ["max", "min"], 72 | "additionalProperties": false 73 | }, 74 | "n_columns": { 75 | "type": "object", 76 | "properties": { 77 | "max": { 78 | "type": "integer", 79 | "minimum": 1 80 | }, 81 | "min": { 82 | "type": "integer", 83 | "minimum": 1 84 | } 85 | }, 86 | "required": ["max", "min"], 87 | "additionalProperties": false 88 | } 89 | }, 90 | "additionalProperties": true 91 | } 92 | }, 93 | "required": ["num_points", "bounds"], 94 | "additionalProperties": true 95 | }, 96 | "fixed_params": { 97 | "type": "object", 98 | "properties": { 99 | "n_columns": { 100 | "type": "integer", 101 | "minimum": 1 102 | }, 103 | "n_rows": { 104 | "type": "integer", 105 | "minimum": 1 106 | }, 107 | "n_iterations": { 108 | "type": "integer", 109 | "minimum": 1 110 | } 111 | }, 112 | "additionalProperties": true 113 | } 114 | }, 115 | "additionalProperties": true 116 | } 
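For reference, a run's parameters can be checked against the schema above before any experiment starts, using jsonschema, the same library the commented-out Hypothesis.validate_config relies on. A minimal sketch; the file paths here are illustrative assumptions, not fixed by the project:

import json

from jsonschema import ValidationError, validate

# Paths are hypothetical; point them at a real config and the schema above.
with open("examples/experiments/performance_test/config.json") as f:
    config = json.load(f)
with open("examples/experiments/performance_test/config.schema.json") as f:
    schema = json.load(f)

try:
    # Raises ValidationError if, e.g., montecarlo_params lacks "num_points"
    # or a bounds entry is missing its "min"/"max" keys.
    validate(instance=config, schema=schema)
    print("config is valid")
except ValidationError as err:
    print(f"invalid config: {err.message}")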
-------------------------------------------------------------------------------- /hypex/comparators/comparators.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | import numpy as np 6 | 7 | from ..dataset import ABCRole, Dataset 8 | from ..utils.constants import NUMBER_TYPES_LIST 9 | from .abstract import Comparator 10 | 11 | NUM_OF_BUCKETS = 10 12 | 13 | 14 | class GroupDifference(Comparator): 15 | def __init__( 16 | self, 17 | compare_by: Literal[ 18 | "groups", "columns", "columns_in_groups", "cross", "matched_pairs" 19 | ] = "groups", 20 | grouping_role: ABCRole | None = None, 21 | target_roles: ABCRole | list[ABCRole] | None = None, 22 | ): 23 | super().__init__( 24 | compare_by=compare_by, 25 | grouping_role=grouping_role, 26 | target_roles=target_roles, 27 | ) 28 | 29 | @property 30 | def search_types(self) -> list[type] | None: 31 | return NUMBER_TYPES_LIST 32 | 33 | @classmethod 34 | def _inner_function( 35 | cls, 36 | data: Dataset, 37 | test_data: Dataset | None = None, 38 | **kwargs, 39 | ) -> dict: 40 | test_data = cls._check_test_data(test_data) 41 | control_mean = data.mean() 42 | test_mean = test_data.mean() 43 | 44 | return { 45 | "control mean": control_mean, 46 | "test mean": test_mean, 47 | "difference": test_mean - control_mean, 48 | "difference %": ( 49 | (test_mean / control_mean - 1) * 100 if control_mean != 0 else None 50 | ), 51 | } 52 | 53 | 54 | class GroupSizes(Comparator): 55 | def __init__( 56 | self, 57 | compare_by: Literal[ 58 | "groups", "columns", "columns_in_groups", "cross", "matched_pairs" 59 | ] = "groups", 60 | grouping_role: ABCRole | None = None, 61 | ): 62 | super().__init__( 63 | compare_by=compare_by, 64 | grouping_role=grouping_role, 65 | target_roles=grouping_role, 66 | ) 67 | 68 | @classmethod 69 | def _inner_function( 70 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 71 | ) -> dict: 72 | size_a = len(data) 73 | size_b = len(test_data) if isinstance(test_data, Dataset) else 0 74 | 75 | return { 76 | "control size": size_a, 77 | "test size": size_b, 78 | "control size %": (size_a / (size_a + size_b)) * 100, 79 | "test size %": (size_b / (size_a + size_b)) * 100, 80 | } 81 | 82 | 83 | class PSI(Comparator): 84 | @classmethod 85 | def _inner_function( 86 | cls, data: Dataset, test_data: Dataset | None = None, **kwargs 87 | ) -> dict[str, float]: 88 | test_data = cls._check_test_data(test_data=test_data) 89 | data.sort(ascending=False) 90 | test_data.sort(ascending=False) 91 | data_column = data.iloc[:, 0] 92 | test_data_column = test_data.iloc[:, 0] 93 | data_bins = np.arange( 94 | data_column.min(), 95 | data_column.max(), 96 | (data_column.max() - data_column.min()) / NUM_OF_BUCKETS, 97 | ) 98 | test_data_bins = np.arange( 99 | test_data_column.min(), 100 | test_data_column.max(), 101 | (test_data_column.max() - test_data_column.min()) / NUM_OF_BUCKETS, 102 | ) 103 | data_groups = data_column.groupby( 104 | data_column.cut(data_bins).get_values(column=data.columns[0]) 105 | ) 106 | test_data_groups = test_data_column.groupby( 107 | test_data_column.cut(test_data_bins).get_values(column=test_data.columns[0]) 108 | ) 109 | 110 | data_psi = [x[1].count() / len(data) for x in data_groups] 111 | test_data_psi = [x[1].count() / len(test_data) for x in test_data_groups] 112 | psi = [(y - x) * np.log(y / x) for x, y in zip(data_psi, test_data_psi)] 113 | return {"PSI": sum(psi)} 114 | 
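For intuition, the PSI that PSI._inner_function computes above reduces to summing (test share - control share) * ln(test share / control share) over value buckets. Below is a minimal numpy sketch of the same idea on raw arrays. It deliberately sidesteps the Dataset API, uses linspace bucket edges instead of the class's arange-based ones, and clips shares to avoid log(0) on empty buckets, so treat it as an approximation rather than a drop-in equivalent:

import numpy as np

def psi(control: np.ndarray, test: np.ndarray, n_buckets: int = 10) -> float:
    # Bucket each sample over its own value range (NUM_OF_BUCKETS above is 10).
    c_edges = np.linspace(control.min(), control.max(), n_buckets + 1)
    t_edges = np.linspace(test.min(), test.max(), n_buckets + 1)
    # Share of observations per bucket; clip so empty buckets don't produce log(0).
    c_share = np.clip(np.histogram(control, bins=c_edges)[0] / len(control), 1e-6, None)
    t_share = np.clip(np.histogram(test, bins=t_edges)[0] / len(test), 1e-6, None)
    # PSI = sum over buckets of (test% - control%) * ln(test% / control%).
    return float(np.sum((t_share - c_share) * np.log(t_share / c_share)))

rng = np.random.default_rng(42)
print(psi(rng.normal(0, 1, 10_000), rng.normal(0.1, 1, 10_000)))  # small shift, low PSI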
-------------------------------------------------------------------------------- /hypex/ui/matching.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..analyzers.matching import MatchingAnalyzer 6 | from ..dataset import ( 7 | AdditionalMatchingRole, 8 | Dataset, 9 | ExperimentData, 10 | GroupingRole, 11 | StatisticRole, 12 | TargetRole, 13 | ) 14 | from ..reporters.matching import MatchingDictReporter, MatchingQualityDatasetReporter 15 | from ..utils import ID_SPLIT_SYMBOL, MATCHING_INDEXES_SPLITTER_SYMBOL 16 | from .base import Output 17 | 18 | 19 | class MatchingOutput(Output): 20 | resume: Dataset 21 | full_data: Dataset 22 | quality_results: Dataset 23 | 24 | def __init__(self, searching_class: type = MatchingAnalyzer): 25 | super().__init__( 26 | resume_reporter=MatchingDictReporter(searching_class), 27 | additional_reporters=MatchingQualityDatasetReporter(), 28 | ) 29 | 30 | def _extract_full_data(self, experiment_data: ExperimentData, indexes: Dataset): 31 | indexes.index = experiment_data.ds.index 32 | filtered_field = indexes.drop( 33 | indexes[indexes[indexes.columns[0]] == -1], axis=0 34 | ) 35 | matched_data = experiment_data.ds.loc[ 36 | list(map(lambda x: x[0], filtered_field.get_values())) 37 | ].rename({i: i + "_matched" for i in experiment_data.ds.columns}) 38 | matched_data.index = filtered_field.index 39 | self.indexes = indexes 40 | self.full_data = experiment_data.ds.append( 41 | matched_data.reindex(experiment_data.ds.index), axis=1 42 | ) 43 | 44 | def extract(self, experiment_data: ExperimentData): 45 | resume = self.resume_reporter.report(experiment_data) 46 | reformatted_resume: dict[str, Any] = {} 47 | for key, value in resume.items(): 48 | if ID_SPLIT_SYMBOL in key: 49 | keys = key.split(ID_SPLIT_SYMBOL) 50 | temp_key = keys[0] if len(keys) < 3 else f"{keys[2]} {keys[0]}" 51 | if temp_key not in reformatted_resume: 52 | reformatted_resume[temp_key] = {} 53 | reformatted_resume[temp_key].update({keys[1]: value}) 54 | if "indexes" in reformatted_resume.keys(): 55 | group_indexes_id = experiment_data.ds.search_columns(GroupingRole()) 56 | indexes = [ 57 | Dataset.from_dict( 58 | { 59 | "indexes": list( 60 | map(int, values.split(MATCHING_INDEXES_SPLITTER_SYMBOL)) 61 | ) 62 | }, 63 | index=experiment_data.ds[ 64 | experiment_data.ds[group_indexes_id] == group 65 | ].index, 66 | roles={"indexes": StatisticRole()}, 67 | ) 68 | for group, values in reformatted_resume.pop("indexes").items() 69 | ] 70 | indexes = indexes[0].append(indexes[1:]).sort() 71 | else: 72 | indexes = Dataset.from_dict( 73 | { 74 | "indexes": list( 75 | map( 76 | int, 77 | resume["indexes"].split(MATCHING_INDEXES_SPLITTER_SYMBOL), 78 | ) 79 | ) 80 | }, 81 | roles={"indexes": AdditionalMatchingRole()}, 82 | ) 83 | 84 | outcome = experiment_data.field_search(TargetRole())[0] 85 | reformatted_resume["outcome"] = { 86 | key: outcome 87 | for key in reformatted_resume[next(iter(reformatted_resume.keys()))].keys() 88 | } 89 | 90 | self.resume = Dataset.from_dict( 91 | reformatted_resume, 92 | roles={ 93 | column: StatisticRole() for column in list(reformatted_resume.keys()) 94 | }, 95 | ) 96 | self._extract_full_data( 97 | experiment_data, 98 | indexes, 99 | ) 100 | self.resume = round(self.resume, 2) 101 | 102 | self.quality_results = self.additional_reporters.report(experiment_data) 103 | -------------------------------------------------------------------------------- 
/hypex/dataset/roles.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | 5 | from ..utils import CategoricalTypes, DefaultRoleTypes, RoleNameType, TargetRoleTypes 6 | 7 | 8 | class ABCRole(ABC): 9 | _role_name: RoleNameType = "Abstract" 10 | 11 | def __init__(self, data_type: DefaultRoleTypes | None = None): 12 | self.data_type = data_type 13 | 14 | @property 15 | def role_name(self) -> str: 16 | return self._role_name 17 | 18 | def __repr__(self) -> str: 19 | return f"{self._role_name}({self.data_type})" 20 | 21 | 22 | class InfoRole(ABCRole): 23 | _role_name: RoleNameType = "Info" 24 | 25 | 26 | class StratificationRole(ABCRole): 27 | _role_name: RoleNameType = "Stratification" 28 | 29 | def __init__(self, data_type: CategoricalTypes | None = None): 30 | super().__init__(data_type) 31 | 32 | 33 | class GroupingRole(ABCRole): 34 | _role_name: RoleNameType = "Grouping" 35 | 36 | def __init__(self, data_type: CategoricalTypes | None = None): 37 | super().__init__(data_type) 38 | 39 | 40 | class TreatmentRole(ABCRole): 41 | _role_name: RoleNameType = "Treatment" 42 | 43 | 44 | class TargetRole(ABCRole): 45 | _role_name: RoleNameType = "Target" 46 | 47 | def __init__(self, data_type: TargetRoleTypes | None = None): 48 | super().__init__(data_type) 49 | 50 | 51 | class FeatureRole(ABCRole): 52 | _role_name: RoleNameType = "Feature" 53 | 54 | 55 | class PreTargetRole(ABCRole): 56 | _role_name: RoleNameType = "PreTarget" 57 | 58 | def __init__(self, data_type: TargetRoleTypes | None = None): 59 | super().__init__(data_type) 60 | 61 | 62 | class StatisticRole(ABCRole): 63 | _role_name: RoleNameType = "Statistic" 64 | 65 | 66 | class ResumeRole(ABCRole): 67 | _role_name = "Resume" 68 | 69 | 70 | class FilterRole(ABCRole): 71 | _role_name: RoleNameType = "Filter" 72 | 73 | 74 | class ConstGroupRole(ABCRole): 75 | _role_name: RoleNameType = "ConstGroup" 76 | 77 | 78 | # ___________________________________________________________________________________________ 79 | class TempRole(ABCRole): 80 | _role_name: RoleNameType = "Temp" 81 | 82 | 83 | class TempTreatmentRole(TempRole, TreatmentRole): 84 | _role_name: RoleNameType = "TempTreatment" 85 | 86 | 87 | class TempTargetRole(TempRole, TargetRole): 88 | _role_name: RoleNameType = "TempTarget" 89 | 90 | 91 | class TempGroupingRole(TempRole, GroupingRole): 92 | _role_name: RoleNameType = "TempGrouping" 93 | 94 | 95 | class DefaultRole(ABCRole): 96 | _role_name: RoleNameType = "Default" 97 | 98 | 99 | class ReportRole(ABCRole): 100 | _role_name: RoleNameType = "Report" 101 | 102 | 103 | # ___________________________________________________________________________________________ 104 | class AdditionalRole(ABCRole): 105 | _role_name: RoleNameType = "Additional" 106 | 107 | 108 | class AdditionalTreatmentRole(AdditionalRole): 109 | _role_name: RoleNameType = "AdditionalTreatment" 110 | 111 | 112 | class AdditionalGroupingRole(AdditionalRole): 113 | _role_name: RoleNameType = "AdditionalGrouping" 114 | 115 | 116 | class AdditionalTargetRole(AdditionalRole): 117 | _role_name: RoleNameType = "AdditionalTarget" 118 | 119 | 120 | class AdditionalPreTargetRole(AdditionalRole): 121 | _role_name: RoleNameType = "AdditionalPreTarget" 122 | 123 | 124 | class AdditionalMatchingRole(AdditionalRole): 125 | _role_name: RoleNameType = "AdditionalMatching" 126 | 127 | 128 | default_roles: dict[RoleNameType, ABCRole] = { 129 | "info": InfoRole(), 130 | "default": 
DefaultRole(), 131 | "feature": FeatureRole(), 132 | "treatment": TreatmentRole(), 133 | "grouping": GroupingRole(), 134 | "target": TargetRole(), 135 | "pretarget": PreTargetRole(), 136 | "stratification": StratificationRole(), 137 | "statistic": StatisticRole(), 138 | "filter": FilterRole(), 139 | "constgroup": ConstGroupRole(), 140 | "additionaltreatment": AdditionalTreatmentRole(), 141 | "additionalgrouping": AdditionalGroupingRole(), 142 | "additionaltarget": AdditionalTargetRole(), 143 | "additionalpretarget": AdditionalPreTargetRole(), 144 | } 145 | -------------------------------------------------------------------------------- /hypex/comparators/power_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | import numpy as np 7 | from scipy.stats import norm 8 | 9 | from ..dataset import ABCRole, Dataset, ExperimentData 10 | from ..utils import ExperimentDataEnum 11 | from .comparators import Comparator 12 | 13 | 14 | class PowerTesting(Comparator, ABC): 15 | def __init__( 16 | self, 17 | grouping_role: ABCRole | None = None, 18 | # space: SpaceEnum = SpaceEnum.auto, 19 | significance: float = 0.95, 20 | power: float = 0.8, 21 | key: Any = "", 22 | ): 23 | super().__init__( 24 | compare_by="groups", 25 | grouping_role=grouping_role, 26 | # space=space, 27 | key=key, 28 | ) 29 | self.significance = significance 30 | self.power = power 31 | 32 | @classmethod 33 | @abstractmethod 34 | def _inner_function( 35 | cls, 36 | data: Dataset, 37 | test_data: Dataset | None = None, 38 | significance: float = 0.95, 39 | power: float = 0.8, 40 | **kwargs, 41 | ) -> float: 42 | pass 43 | 44 | def execute(self, data: ExperimentData) -> ExperimentData: 45 | return super().execute(data) 46 | 47 | 48 | class MDEBySize(PowerTesting): 49 | def _set_value( 50 | self, data: ExperimentData, value: Dataset | None = None, key: Any = None 51 | ) -> ExperimentData: 52 | data.set_value( 53 | ExperimentDataEnum.variables, 54 | self.id, 55 | value, 56 | ) 57 | return data 58 | 59 | @classmethod 60 | def _inner_function( 61 | cls, 62 | data: Dataset, 63 | test_data: Dataset | None = None, 64 | significance: float = 0.95, 65 | power: float = 0.8, 66 | **kwargs, 67 | ) -> float: 68 | m = norm.ppf((1 + significance) / 2) + norm.ppf(power) 69 | if not test_data: 70 | raise ValueError("test_data is required") 71 | 72 | n_test, n_control = len(test_data), len(data) 73 | 74 | var_test, var_control = test_data.var(ddof=1), data.var(ddof=1) 75 | s = np.sqrt(var_test / n_test + var_control / n_control) 76 | 77 | return m * s 78 | 79 | 80 | # 81 | # 82 | # class StatPowerByTTestInd(TestPower): 83 | # 84 | # def _inner_function(self, control_data, test_data) -> ExperimentData: 85 | # control_size = len(control_data) 86 | # test_size = len(test_data) 87 | # 88 | # analysis = TTestIndPower() 89 | # ratio = test_size / control_size 90 | # return analysis.power( 91 | # effect_size=effect_size, 92 | # nobs1=test_size, 93 | # ratio=ratio, 94 | # alpha=significance, 95 | # 96 | 97 | 98 | # class MDEBySize(GroupComparator): 99 | # def __init__( 100 | # self, 101 | # grouping_role: Optional[ABCRole] = None, 102 | # space: SpaceEnum = SpaceEnum.auto, 103 | # full_name: Optional[str] = None, 104 | # key: Any = "", 105 | # power: float = 0.8, 106 | # significance: float = 0.95, 107 | # ): 108 | # super().__init__(grouping_role, space, full_name, key) 109 | # self.power = power 110 | # 
self.significance = significance 111 | # 112 | # @staticmethod 113 | # def _inner_function( 114 | # control_data, test_data, significance=0.95, power=0.8, **kwargs 115 | # ) -> Dict[str, Any]: 116 | # result = {} 117 | # m = norm.ppf(1 - significance / 2) - norm.ppf(power) 118 | # n_control, n_test = len(control_data), len(test_data) 119 | # proportion = n_test / (n_test + n_control) 120 | # p = np.sqrt(1 / (proportion * (1 - proportion))) 121 | # for target in control_data.columns: 122 | # var_control = control_data[target].var() 123 | # var_test = test_data[target].var() 124 | # s = np.sqrt(var_test / n_test + var_control / n_control) 125 | # result[target] = p * m * s 126 | # 127 | # return result 128 | # 129 | # @staticmethod 130 | # def calc( 131 | # cls: Dataset, 132 | # data: Union[Sequence[str], str, None], 133 | # group_field: Optional[str] = None, 134 | # grouping_data=None, 135 | # target_fields=None, 136 | # **kwargs 137 | # ): 138 | # return GroupComparator.calc( 139 | # data=data, 140 | # group_field=group_field, 141 | # target_fields=target_fields, 142 | # comparison_function=MDEBySize._inner_function, 143 | # power=power, 144 | # significance=target_fields, 145 | # ) 146 | # 147 | # def execute(self, data: ExperimentData) -> ExperimentData: 148 | # subdata = data.ds.loc[ 149 | # :, data.ds.get_columns_by_roles([TargetRole(), self.grouping_role]) 150 | # ] 151 | # ed = super().execute(ExperimentData(subdata)) 152 | # return self._set_value(data, ed.analysis_tables[self._id]) 153 | -------------------------------------------------------------------------------- /hypex/comparators/distances.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any, Sequence 5 | 6 | from ..dataset import ( 7 | ABCRole, 8 | Dataset, 9 | ExperimentData, 10 | FeatureRole, 11 | GroupingRole, 12 | TargetRole, 13 | ) 14 | from ..executor import Calculator 15 | from ..extensions.scipy_linalg import CholeskyExtension, InverseExtension 16 | from ..utils import ExperimentDataEnum, NotSuitableFieldError 17 | from ..utils.adapter import Adapter 18 | 19 | 20 | class MahalanobisDistance(Calculator): 21 | def __init__( 22 | self, 23 | grouping_role: ABCRole | None = None, 24 | key: Any = "", 25 | ): 26 | super().__init__(key=key) 27 | self.grouping_role = grouping_role or GroupingRole() 28 | 29 | @classmethod 30 | def _execute_inner_function( 31 | cls, 32 | grouping_data, 33 | target_fields: list[str] | None = None, 34 | **kwargs, 35 | ) -> dict: 36 | result = {} 37 | for i in range(1, len(grouping_data)): 38 | result.update( 39 | cls._inner_function( 40 | data=( 41 | grouping_data[0][1][target_fields] 42 | if target_fields 43 | else grouping_data[0][1] 44 | ), 45 | test_data=( 46 | grouping_data[i][1][target_fields] 47 | if target_fields 48 | else grouping_data[i][1] 49 | ), 50 | **kwargs, 51 | ) 52 | ) 53 | return result 54 | 55 | def _set_value( 56 | self, data: ExperimentData, value: dict | None = None, key: Any = None 57 | ) -> ExperimentData: 58 | for key, value_ in value.items(): 59 | data = data.set_value( 60 | ExperimentDataEnum.groups, 61 | self.id, 62 | value_, 63 | key=key, 64 | ) 65 | return data 66 | 67 | def _get_fields(self, data: ExperimentData): 68 | group_field = data.field_search(self.grouping_role) 69 | target_fields = data.field_search(FeatureRole(), search_types=self.search_types) 70 | return group_field, target_fields 71 | 72 | @property 73 | def 
search_types(self) -> list[type] | None: 74 | return [int, float] 75 | 76 | @classmethod 77 | def _inner_function(cls, data: Dataset, test_data: Dataset | None = None, **kwargs): 78 | test_data = cls._check_test_data(test_data) 79 | cov = (data.cov() + test_data.cov()) / 2 if test_data else data.cov() 80 | cholesky = CholeskyExtension().calc(cov) 81 | mahalanobis_transform = InverseExtension().calc(cholesky) 82 | y_control = data.dot(mahalanobis_transform.transpose()) 83 | if test_data: 84 | y_test = test_data.dot(mahalanobis_transform.transpose()) 85 | return {"control": y_control, "test": y_test} 86 | return {"control": y_control} 87 | 88 | @classmethod 89 | def calc( 90 | cls, 91 | data: Dataset, 92 | group_field: Sequence[str] | str | None = None, 93 | grouping_data: list[tuple[str, Dataset]] | None = None, 94 | target_fields: str | list[str] | None = None, 95 | **kwargs, 96 | ) -> dict: 97 | group_field = Adapter.to_list(group_field) 98 | 99 | if grouping_data is None: 100 | grouping_data = data.groupby(group_field) 101 | if len(grouping_data) > 1: 102 | grouping_data[0][1].tmp_roles = data.tmp_roles 103 | else: 104 | raise NotSuitableFieldError(group_field, "Grouping") 105 | return cls._execute_inner_function( 106 | grouping_data, target_fields=target_fields, old_data=data, **kwargs 107 | ) 108 | 109 | def execute(self, data: ExperimentData) -> ExperimentData: 110 | group_field, target_fields = self._get_fields(data=data) 111 | self.key = str( 112 | target_fields[0] if len(target_fields) == 1 else (target_fields or "") 113 | ) 114 | if ( 115 | not target_fields and data.ds.tmp_roles 116 | ): # if no column suits the test, target_fields is empty; when temporary roles are set this is expected behavior, so skip silently 117 | return data 118 | if group_field[0] in data.groups: # TODO: recheck whether this is the correct check 119 | grouping_data = list(data.groups[group_field[0]].items()) 120 | else: 121 | grouping_data = None 122 | t_data = deepcopy(data.ds) 123 | if target_fields[1] not in t_data.columns: 124 | t_data = t_data.add_column( 125 | data.additional_fields[target_fields[1]], 126 | role={target_fields[1]: TargetRole()}, 127 | ) 128 | compare_result = self.calc( 129 | data=t_data, 130 | group_field=group_field, 131 | target_fields=target_fields, 132 | grouping_data=grouping_data, 133 | ) 134 | return self._set_value(data, compare_result) 135 | -------------------------------------------------------------------------------- /hypex/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | from .analyzers.ab import ABAnalyzer 6 | from .comparators import Chi2Test, GroupDifference, GroupSizes, TTest, UTest 7 | from .dataset import TargetRole, TreatmentRole 8 | from .experiments.base import Experiment, OnRoleExperiment 9 | from .ui.ab import ABOutput 10 | from .ui.base import ExperimentShell 11 | from .utils import ABNTestMethodsEnum 12 | 13 | 14 | class ABTest(ExperimentShell): 15 | """A class for conducting A/B tests with configurable statistical tests and multiple testing correction. 16 | 17 | This class provides functionality to run A/B tests with options for different statistical tests 18 | (t-test, u-test, chi-square test) and multiple testing correction methods. 19 | 20 | Args: 21 | additional_tests (Union[str, List[str], None], optional): Statistical test(s) to run in addition to 22 | the default group difference calculation. 
Valid options are "t-test", "u-test", and "chi2-test". 23 | Can be a single test name or list of test names. Defaults to ["t-test"]. 24 | multitest_method (str, optional): Method to use for multiple testing correction. Valid options are: 25 | "bonferroni", "sidak", "holm-sidak", "holm", "simes-hochberg", "hommel", "fdr_bh", "fdr_by", 26 | "fdr_tsbh", "fdr_tsbhy", "quantile". Defaults to "holm". 27 | 28 | For more information refer to the statsmodels documentation: 29 | https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html 30 | 31 | Examples 32 | -------- 33 | .. code-block:: python 34 | 35 | # Basic A/B test with default t-test 36 | ab_test = ABTest() 37 | results = ab_test.execute(data) 38 | 39 | # A/B test with multiple statistical tests 40 | ab_test = ABTest( 41 | additional_tests=["t-test", "chi2-test"], 42 | multitest_method="bonferroni" 43 | ) 44 | results = ab_test.execute(data) 45 | """ 46 | 47 | @staticmethod 48 | def _make_experiment(additional_tests, multitest_method): 49 | """Creates an experiment configuration with specified statistical tests. 50 | 51 | Args: 52 | Args: 53 | additional_tests (Union[str, List[str], None], optional): Statistical test(s) to run in addition to 54 | the default group difference calculation. Valid options are "t-test", "u-test", and "chi2-test". 55 | Can be a single test name or list of test names. Defaults to ["t-test"]. 56 | multitest_method (str, optional): Method to use for multiple testing correction. Valid options are: 57 | "bonferroni", "sidak", "holm-sidak", "holm", "simes-hochberg", "hommel", "fdr_bh", "fdr_by", 58 | "fdr_tsbh", "fdr_tsbhy", "quantile". Defaults to "holm". 59 | For more information refer to the statsmodels documentation: 60 | 61 | 62 | Returns: 63 | Experiment: Configured experiment object with specified tests and correction method. 
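Examples
--------
.. code-block:: python

    # Illustrative sketch of the assembled pipeline: for
    # additional_tests=["t-test", "u-test"] and multitest_method="holm",
    # the returned Experiment runs GroupSizes, then an OnRoleExperiment
    # over TargetRole columns executing GroupDifference, TTest and UTest,
    # and finally an ABAnalyzer configured with the "holm" correction.
    experiment = ABTest._make_experiment(["t-test", "u-test"], "holm")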
64 | """ 65 | test_mapping = { 66 | "t-test": TTest(compare_by="groups", grouping_role=TreatmentRole()), 67 | "u-test": UTest(compare_by="groups", grouping_role=TreatmentRole()), 68 | "chi2-test": Chi2Test(compare_by="groups", grouping_role=TreatmentRole()), 69 | } 70 | on_role_executors = [GroupDifference(grouping_role=TreatmentRole())] 71 | additional_tests = ["t-test"] if additional_tests is None else additional_tests 72 | additional_tests = ( 73 | additional_tests 74 | if isinstance(additional_tests, list) 75 | else [additional_tests] 76 | ) 77 | for i in additional_tests: 78 | on_role_executors += [test_mapping[i]] 79 | return Experiment( 80 | executors=[ 81 | GroupSizes(grouping_role=TreatmentRole()), 82 | OnRoleExperiment( 83 | executors=on_role_executors, 84 | role=TargetRole(), 85 | ), 86 | ABAnalyzer( 87 | multitest_method=( 88 | ABNTestMethodsEnum(multitest_method) 89 | if multitest_method 90 | else None 91 | ) 92 | ), 93 | ] 94 | ) 95 | 96 | def __init__( 97 | self, 98 | additional_tests: ( 99 | Literal["t-test", "u-test", "chi2-test"] 100 | | list[Literal["t-test", "u-test", "chi2-test"]] 101 | | None 102 | ) = None, 103 | multitest_method: ( 104 | Literal[ 105 | "bonferroni", 106 | "sidak", 107 | "holm-sidak", 108 | "holm", 109 | "simes-hochberg", 110 | "hommel", 111 | "fdr_bh", 112 | "fdr_by", 113 | "fdr_tsbh", 114 | "fdr_tsbhy", 115 | "quantile", 116 | ] 117 | | None 118 | ) = "holm", 119 | t_test_equal_var: bool | None = None, 120 | ): 121 | super().__init__( 122 | experiment=self._make_experiment(additional_tests, multitest_method), 123 | output=ABOutput(), 124 | ) 125 | if t_test_equal_var is not None: 126 | self.experiment.set_params({TTest: {"calc_kwargs": {"equal_var": t_test_equal_var}}}) 127 | -------------------------------------------------------------------------------- /hypex/reporters/abstract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | from ..dataset import Dataset, ExperimentData 7 | from ..dataset.roles import InfoRole, ReportRole, TreatmentRole 8 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum 9 | from ..utils.errors import AbstractMethodError 10 | 11 | 12 | class Reporter(ABC): 13 | @abstractmethod 14 | def report(self, data: ExperimentData): 15 | raise AbstractMethodError 16 | 17 | 18 | class DictReporter(Reporter, ABC): 19 | def __init__(self, front=True): 20 | self.front = front 21 | 22 | @staticmethod 23 | def extract_from_one_row_dataset(data: Dataset) -> dict[str, Any]: 24 | return {k: v[0] for k, v in data.to_dict()["data"]["data"].items()} 25 | 26 | def _extract_from_comparator(self, data: ExperimentData, comparator_id: str): 27 | result = {} 28 | field = comparator_id[comparator_id.rfind(ID_SPLIT_SYMBOL) + 1 :] 29 | executor_name = comparator_id[: comparator_id.find(ID_SPLIT_SYMBOL)] 30 | sep = " " if self.front else ID_SPLIT_SYMBOL 31 | analysis_dict = data.analysis_tables[comparator_id].to_dict()["data"] 32 | for i, index_value in enumerate(analysis_dict["index"]): 33 | for k, v in analysis_dict["data"].items(): 34 | key = sep.join( 35 | [field, executor_name, k, str(index_value)] 36 | if field 37 | else [executor_name, k, str(index_value)] 38 | ) 39 | result[key] = v[i] 40 | return result 41 | 42 | def _extract_from_comparators( 43 | self, data: ExperimentData, comparator_ids: list[str] 44 | ) -> dict[str, Any]: 45 | result = {} 46 | for comparator_id in comparator_ids: 47 | 
result.update(self._extract_from_comparator(data, comparator_id)) 48 | return result 49 | 50 | @abstractmethod 51 | def report(self, data: ExperimentData) -> dict: 52 | raise AbstractMethodError 53 | 54 | 55 | class OnDictReporter(Reporter, ABC): 56 | def __init__(self, dict_reporter: DictReporter) -> None: 57 | self.dict_reporter = dict_reporter 58 | 59 | 60 | class DatasetReporter(OnDictReporter): 61 | def report(self, data: ExperimentData) -> dict[str, Dataset] | Dataset: 62 | dict_result = self.dict_reporter.report(data) 63 | return self.convert_to_dataset( 64 | dict_result 65 | ) # TODO: change to DatasetAdapter.to_dataset() 66 | 67 | @staticmethod 68 | def convert_to_dataset(data: dict) -> dict[str, Dataset] | Dataset: 69 | return Dataset.from_dict(roles={k: ReportRole() for k in data}, data=[data]) 70 | 71 | 72 | class TestDictReporter(DictReporter): 73 | @staticmethod 74 | def _get_struct_dict(data: dict): 75 | dict_result = {} 76 | for key, value in data.items(): 77 | if ID_SPLIT_SYMBOL in key: 78 | key_split = key.split(ID_SPLIT_SYMBOL) 79 | if key_split[2] in ("pass", "p-value", "difference", "difference %", "control mean", "test mean"): 80 | if key_split[0] not in dict_result: 81 | dict_result[key_split[0]] = { 82 | key_split[3]: {key_split[1]: {key_split[2]: value}} 83 | } 84 | elif key_split[3] not in dict_result[key_split[0]]: 85 | dict_result[key_split[0]][key_split[3]] = { 86 | key_split[1]: {key_split[2]: value} 87 | } 88 | elif key_split[1] not in dict_result[key_split[0]][key_split[3]]: 89 | dict_result[key_split[0]][key_split[3]][key_split[1]] = { 90 | key_split[2]: value 91 | } 92 | else: 93 | dict_result[key_split[0]][key_split[3]][key_split[1]][ 94 | key_split[2] 95 | ] = value 96 | return dict_result 97 | 98 | @staticmethod 99 | def _convert_struct_dict_to_dataset(data: dict) -> Dataset: 100 | def rename_passed(data: dict[str, bool]): 101 | return { 102 | c: ( 103 | ("NOT OK" if (v is True or v == "True") else "OK") 104 | if "pass" in c 105 | else v 106 | ) 107 | for c, v in data.items() 108 | } 109 | 110 | result = [] 111 | for feature, groups in data.items(): 112 | for group, tests in groups.items(): 113 | t_values = {"feature": feature, "group": group} 114 | for test, values in tests.items(): 115 | if test == "GroupDifference": 116 | t_values["control mean"] = values.get("control mean") 117 | t_values["test mean"] = values.get("test mean") 118 | t_values["difference"] = values.get("difference") 119 | t_values["difference %"] = values.get("difference %") 120 | else: 121 | t_values[f"{test} pass"] = values.get("pass") 122 | t_values[f"{test} p-value"] = values.get("p-value") 123 | result.append(t_values) 124 | result = [rename_passed(d) for d in result] 125 | return Dataset.from_dict( 126 | result, 127 | roles={"feature": InfoRole(), "group": TreatmentRole()}, 128 | ) 129 | 130 | def extract_tests(self, data: ExperimentData) -> dict[str, Any]: 131 | test_ids = data.get_ids( 132 | self.tests, searched_space=ExperimentDataEnum.analysis_tables 133 | ) 134 | result = {} 135 | for class_, ids in test_ids.items(): 136 | result.update( 137 | self._extract_from_comparators( 138 | data, ids[ExperimentDataEnum.analysis_tables.value] 139 | ) 140 | ) 141 | return {k: v for k, v in result.items() if "pass" in k or "p-value" in k} 142 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import datetime 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | 18 | CURR_PATH = os.path.abspath(os.path.dirname(__file__)) 19 | LIB_PATH = os.path.join(CURR_PATH, os.path.pardir) 20 | sys.path.insert(0, LIB_PATH) 21 | 22 | project = "HypEx" 23 | copyright = f"{datetime.datetime.now().year}, AI Lab ML Tools" 24 | author = "AI Lab ML Tools" 25 | 26 | os.environ["DOCUMENTATION_ENV"] = "True" 27 | 28 | extensions = [ 29 | "sphinx.ext.autodoc", 30 | "sphinx.ext.autosummary", # will be used for tables 31 | "sphinx.ext.intersphinx", 32 | "sphinx.ext.napoleon", # structure 33 | "sphinx.ext.viewcode", # for [source] button 34 | "nbsphinx", 35 | "nbsphinx_link", 36 | "sphinx_autodoc_typehints", 37 | "IPython.sphinxext.ipython_console_highlighting", 38 | ] 39 | 40 | exclude_patterns = [ 41 | "_build/*", 42 | "**.ipynb_checkpoints", 43 | "Thumbs.db", 44 | ".DS_Store", 45 | ] 46 | 47 | # Delete external references 48 | autosummary_mock_imports = [ 49 | "numpy", 50 | "pandas", 51 | "scipy", 52 | "sklearn", 53 | "networkx", 54 | "holidays", 55 | "joblib", 56 | "yaml", 57 | "gensim", 58 | "PIL", 59 | "albumentations", 60 | "tqdm", 61 | "matplotlib", 62 | "seaborn", 63 | "json2html", 64 | "faiss", 65 | "statsmodels", 66 | ] 67 | 68 | # Add any paths that contain templates here, relative to this directory. 69 | templates_path = ["_templates"] 70 | 71 | # -- Options for HTML output ------------------------------------------------- 72 | 73 | # The theme to use for HTML and HTML Help pages. See the documentation for 74 | # a list of builtin themes. 75 | html_theme = "sphinx_rtd_theme" 76 | highlight_language = "python" 77 | 78 | html_theme_options = { 79 | 'logo_only': False, 80 | 'prev_next_buttons_location': 'bottom', 81 | 'style_external_links': True, 82 | 'vcs_pageview_mode': 'blob', 83 | 'style_nav_header_background': '#2980B9', 84 | # Toc options 85 | 'collapse_navigation': True, 86 | 'sticky_navigation': True, 87 | 'navigation_depth': 4, 88 | 'includehidden': True, 89 | 'titles_only': False, 90 | 'globaltoc_collapse': True, 91 | 'globaltoc_maxdepth': 3, 92 | } 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 
97 | html_static_path = ["_static"] 98 | 99 | html_css_files = [ 100 | 'custom.css', 101 | ] 102 | 103 | html_show_sourcelink = False 104 | html_sidebars = { 105 | '**': [ 106 | 'globaltoc.html', 107 | 'relations.html', 108 | 'sourcelink.html', 109 | 'searchbox.html', 110 | ] 111 | } 112 | 113 | # code style 114 | pygments_style = "sphinx" 115 | 116 | nbsphinx_execute = "never" 117 | 118 | # autodoc 119 | # function names that will not be included in documentation 120 | EXCLUDED_MEMBERS = ",".join( 121 | [ 122 | "get_own_record_history_wrapper", 123 | "get_record_history_wrapper", 124 | "record_history_omit", 125 | "record_history_only", 126 | ] 127 | ) 128 | 129 | autodoc_default_options = { 130 | "ignore-module-all": True, 131 | "show-inheritance": True, 132 | "exclude-members": EXCLUDED_MEMBERS, 133 | 'inherited-members': False, 134 | } 135 | 136 | # order of members in docs; useful for methods in a class 137 | autodoc_member_order = "bysource" 138 | 139 | # type hints in signatures ("none" hides them) 140 | autodoc_typehints = "none" 141 | 142 | # use only the class docstring, omitting __init__ docstrings 143 | autoclass_content = "class" 144 | 145 | # all warnings will be produced as errors 146 | autodoc_warningiserror = True 147 | 148 | # do not use parentheses when linking to a function 149 | add_function_parentheses = False 150 | 151 | # napoleon 152 | # Google docstring format is used in these docs 153 | napoleon_google_docstring = True 154 | napoleon_numpy_docstring = False 155 | 156 | napoleon_include_init_with_doc = True 157 | 158 | # to omit private members 159 | napoleon_include_private_with_doc = False 160 | 161 | # do not document special (dunder) members 162 | napoleon_include_special_with_doc = False 163 | 164 | napoleon_use_param = True 165 | 166 | # True to use a :keyword: role for each function keyword argument 167 | napoleon_use_keyword = True 168 | 169 | # True to use the .. admonition:: directive for Example/Examples sections instead of .. 
rubric:: 170 | napoleon_use_admonition_for_examples = True 171 | 172 | # Autosummary true if you want to generate it from very beginning 173 | autosummary_generate = True 174 | 175 | set_type_checking_flag = True 176 | 177 | always_document_param_types = False 178 | 179 | intersphinx_mapping = { 180 | "python": ("https://docs.python.org/3", None), 181 | "numpy": ("https://numpy.org/doc/stable", None), 182 | "scipy": ("https://docs.scipy.org/doc/scipy/", None), 183 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 184 | "sklearn": ("https://scikit-learn.org/stable/", None), 185 | "PIL": ("https://pillow.readthedocs.io/en/stable/", None), 186 | } 187 | 188 | 189 | # autodoc_type_aliases = { 190 | # "RoleType": "lightautoml.dataset.roles.ColumnRole", 191 | # "NpDataset": "lightautoml.text.utils.NpDataset", 192 | # } 193 | 194 | 195 | def skip_member(app, what, name, obj, skip, options): 196 | if obj.__doc__ is None: 197 | return True 198 | return None 199 | 200 | 201 | def setup(app): 202 | app.add_css_file("style.css") # customizing default theme 203 | app.connect("autodoc-skip-member", skip_member) 204 | -------------------------------------------------------------------------------- /hypex/analyzers/ab.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from copy import deepcopy 4 | from typing import Any 5 | 6 | from ..comparators import TTest, UTest 7 | from ..dataset import Dataset, ExperimentData, StatisticRole, TargetRole, TreatmentRole 8 | from ..experiments.base import Executor 9 | from ..extensions.statsmodels import MultiTest, MultitestQuantile 10 | from ..utils import ( 11 | ID_SPLIT_SYMBOL, 12 | NAME_BORDER_SYMBOL, 13 | ABNTestMethodsEnum, 14 | BackendsEnum, 15 | ExperimentDataEnum, 16 | ) 17 | 18 | 19 | class ABAnalyzer(Executor): 20 | def __init__( 21 | self, 22 | multitest_method: ABNTestMethodsEnum | None = None, 23 | alpha: float = 0.05, 24 | equal_variance: bool = True, 25 | quantiles: float | list[float] | None = None, 26 | iteration_size: int = 20000, 27 | random_state: int | None = None, 28 | key: Any = "", 29 | ): 30 | self.multitest_method = multitest_method 31 | self.alpha = alpha 32 | self.equal_variance = equal_variance 33 | self.quantiles = quantiles 34 | self.iteration_size = iteration_size 35 | self.random_state = random_state 36 | super().__init__(key) 37 | 38 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 39 | return data.set_value( 40 | ExperimentDataEnum.analysis_tables, 41 | self.id + key if key else self.id, 42 | value, 43 | ) 44 | 45 | def execute_multitest(self, data: ExperimentData, p_values: Dataset, **kwargs): 46 | group_field = data.ds.search_columns(TreatmentRole())[0] 47 | target_fields = data.ds.search_columns(TargetRole(), search_types=[int, float]) 48 | if self.multitest_method and len(data.groups[group_field]) > 2: 49 | if self.multitest_method != ABNTestMethodsEnum.quantile: 50 | multitest_result = MultiTest(self.multitest_method).calc( 51 | p_values, **kwargs 52 | ) 53 | groups = [] 54 | for i in list(data.groups[group_field].keys())[1:]: 55 | groups += [i] * len(target_fields) 56 | multitest_result = multitest_result.add_column( 57 | groups 58 | * ( 59 | len(multitest_result) 60 | // len(target_fields) 61 | // (len(data.groups[group_field]) - 1) 62 | ), 63 | role={"group": StatisticRole()}, 64 | ) 65 | 66 | else: 67 | multitest_result = Dataset.create_empty() 68 | for target_field in target_fields: 69 | 
multitest_result = multitest_result.append( 70 | MultitestQuantile( 71 | self.alpha, 72 | self.iteration_size, 73 | self.equal_variance, 74 | self.random_state, 75 | ).calc( 76 | p_values, 77 | group_field=group_field, 78 | target_field=target_field, 79 | quantiles=self.quantiles, 80 | ) 81 | ) 82 | return self._set_value(data, multitest_result, key="MultiTest") 83 | return data 84 | 85 | def _add_pvalues(self, multitest_pvalues, value, field): 86 | if ( 87 | self.multitest_method 88 | and field == "p-value" 89 | and self.multitest_method != ABNTestMethodsEnum.quantile 90 | ): 91 | multitest_pvalues = multitest_pvalues.append(value) 92 | return multitest_pvalues 93 | 94 | def execute(self, data: ExperimentData) -> ExperimentData: 95 | executor_ids = data.get_ids([TTest, UTest]) 96 | num_groups = len(data.groups[data.ds.search_columns(TreatmentRole())[0]]) - 1 97 | groups = list(data.groups[data.ds.search_columns(TreatmentRole())[0]].items()) 98 | multitest_pvalues = Dataset.create_empty() 99 | analysis_data = {} 100 | for c, spaces in executor_ids.items(): 101 | analysis_ids = spaces.get("analysis_tables", []) 102 | if len(analysis_ids) == 0: 103 | continue 104 | t_data = deepcopy(data.analysis_tables[analysis_ids[0]]) 105 | for aid in analysis_ids[1:]: 106 | t_data = t_data.append(data.analysis_tables[aid]) 107 | if len(analysis_ids) < len(t_data): 108 | analysis_ids *= num_groups 109 | t_data.data.index = analysis_ids 110 | for f in ["p-value", "pass"]: 111 | for i in range(0, len(analysis_ids), len(analysis_ids) // num_groups): 112 | value = t_data.iloc[i : i + len(analysis_ids) // num_groups][f] 113 | multitest_pvalues = self._add_pvalues(multitest_pvalues, value, f) 114 | analysis_data[f"{c} {f} {groups[i // num_groups + 1][0]}"] = ( 115 | value.mean() 116 | ) 117 | if c not in ["UTest", "TTest"]: 118 | indexes = t_data.index 119 | values = t_data.data.values.tolist() 120 | for idx, value in zip(indexes, values): 121 | name = idx.split(ID_SPLIT_SYMBOL)[-1] 122 | analysis_data[ 123 | f"{c} {name[name.find(NAME_BORDER_SYMBOL) + 1 : name.rfind(NAME_BORDER_SYMBOL)]}" 124 | ] = value[0] 125 | 126 | analysis_dataset = Dataset.from_dict( 127 | [analysis_data], 128 | {f: StatisticRole() for f in analysis_data}, 129 | BackendsEnum.pandas, 130 | ) 131 | data = self.execute_multitest( 132 | data, 133 | ( 134 | multitest_pvalues 135 | if not multitest_pvalues.is_empty() 136 | and self.multitest_method != ABNTestMethodsEnum.quantile 137 | else data.ds 138 | ), 139 | ) 140 | 141 | return self._set_value(data, analysis_dataset) 142 | -------------------------------------------------------------------------------- /hypex/ml/faiss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Literal 4 | 5 | from ..comparators.distances import MahalanobisDistance 6 | from ..dataset import ( 7 | ABCRole, 8 | AdditionalMatchingRole, 9 | Dataset, 10 | ExperimentData, 11 | FeatureRole, 12 | ) 13 | from ..executor import MLExecutor 14 | from ..extensions.faiss import FaissExtension 15 | from ..utils import ExperimentDataEnum 16 | from ..utils.errors import PairsNotFoundError 17 | 18 | 19 | class FaissNearestNeighbors(MLExecutor): 20 | def __init__( 21 | self, 22 | n_neighbors: int = 1, 23 | two_sides: bool = False, 24 | test_pairs: bool = False, 25 | grouping_role: ABCRole | None = None, 26 | key: Any = "", 27 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 28 | ): 29 | self.n_neighbors = n_neighbors 30 | self.two_sides = 
two_sides 31 | self.test_pairs = test_pairs 32 | self.faiss_mode = faiss_mode 33 | super().__init__( 34 | grouping_role=grouping_role, target_role=FeatureRole(), key=key 35 | ) 36 | 37 | @classmethod 38 | def _execute_inner_function( 39 | cls, 40 | grouping_data, 41 | target_field: str | None = None, 42 | n_neighbors: int | None = None, 43 | two_sides: bool | None = None, 44 | test_pairs: bool | None = None, 45 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 46 | **kwargs, 47 | ) -> dict: 48 | if test_pairs is not True: 49 | data = cls._inner_function( 50 | data=grouping_data[0][1], 51 | test_data=grouping_data[1][1], 52 | n_neighbors=n_neighbors or 1, 53 | faiss_mode=faiss_mode, 54 | **kwargs, 55 | ) 56 | if two_sides is not True: 57 | return {"test": data} 58 | return { 59 | "test": data, 60 | "control": cls._inner_function( 61 | data=grouping_data[1][1], 62 | test_data=grouping_data[0][1], 63 | n_neighbors=n_neighbors or 1, 64 | faiss_mode=faiss_mode, 65 | **kwargs, 66 | ), 67 | } 68 | data = cls._inner_function( 69 | data=grouping_data[1][1], 70 | test_data=grouping_data[0][1], 71 | n_neighbors=n_neighbors or 1, 72 | faiss_mode=faiss_mode, 73 | **kwargs, 74 | ) 75 | if two_sides is not True: 76 | return {"control": data} 77 | return { 78 | "control": data, 79 | "test": cls._inner_function( 80 | data=grouping_data[0][1], 81 | test_data=grouping_data[1][1], 82 | n_neighbors=n_neighbors or 1, 83 | faiss_mode=faiss_mode, 84 | **kwargs, 85 | ), 86 | } 87 | 88 | @classmethod 89 | def _inner_function( 90 | cls, 91 | data: Dataset, 92 | test_data: Dataset | None = None, 93 | target_data: Dataset | None = None, 94 | n_neighbors: int | None = None, 95 | faiss_mode: Literal["base", "fast", "auto"] = "auto", 96 | **kwargs, 97 | ) -> Any: 98 | return FaissExtension(n_neighbors=n_neighbors or 1, faiss_mode=faiss_mode).calc( 99 | data=data, test_data=test_data 100 | ) 101 | 102 | def fit(self, X: Dataset, Y: Dataset | None = None) -> MLExecutor: 103 | return FaissExtension(self.n_neighbors, self.faiss_mode).fit(X=X, Y=Y) 104 | 105 | def predict(self, X: Dataset) -> Dataset: 106 | return FaissExtension().predict(X) 107 | 108 | def execute(self, data: ExperimentData) -> ExperimentData: 109 | group_field, features_fields = self._get_fields(data=data) 110 | if group_field[0] in data.groups: 111 | grouping_data = list(data.groups[group_field[0]].items()) 112 | else: 113 | grouping_data = data.ds.groupby(group_field, fields_list=features_fields) 114 | distances_keys = data.get_ids(MahalanobisDistance, ExperimentDataEnum.groups) 115 | if len(distances_keys["MahalanobisDistance"]["groups"]) > 0: 116 | grouping_data = list( 117 | data.groups[distances_keys["MahalanobisDistance"]["groups"][0]].items() 118 | ) 119 | compare_result = self.calc( 120 | data=data.ds, 121 | group_field=group_field, 122 | grouping_data=grouping_data, 123 | features_fields=features_fields, 124 | n_neighbors=self.n_neighbors, 125 | faiss_mode=self.faiss_mode, 126 | two_sides=self.two_sides, 127 | test_pairs=self.test_pairs, 128 | ) 129 | ds = data.ds.groupby(group_field) 130 | matched_indexes = Dataset.create_empty() 131 | for i in range(len(compare_result.columns)): 132 | group = ( 133 | grouping_data[1][1] 134 | if compare_result.columns[i] == "test" 135 | else grouping_data[0][1] 136 | ) 137 | t_ds = ds[0][1] if compare_result.columns[i] == "test" else ds[1][1] 138 | t_index_field = ( 139 | compare_result[compare_result.columns[i]] 140 | .loc[: len(group) - 1] 141 | .rename({compare_result.columns[i]: "indexes"}) 142 | ) 143
| if t_index_field.isna().sum() > 0: 144 | raise PairsNotFoundError 145 | matched_indexes = matched_indexes.append( 146 | Dataset.from_dict( 147 | data={ 148 | "indexes": t_ds.iloc[ 149 | list(map(lambda x: int(x[0]), t_index_field.get_values())) 150 | ].index 151 | }, 152 | roles={"indexes": AdditionalMatchingRole()}, 153 | index=group.index, 154 | ) 155 | ).sort() 156 | if len(matched_indexes) < len(data.ds) and not self.two_sides: 157 | matched_indexes = matched_indexes.reindex(data.ds.index, fill_value=-1) 158 | elif len(matched_indexes) < len(data.ds) and self.two_sides: 159 | raise PairsNotFoundError 160 | return self._set_value(data, matched_indexes, key="matched") 161 | -------------------------------------------------------------------------------- /hypex/extensions/scipy_stats.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import Callable 5 | 6 | from scipy.stats import ( # type: ignore 7 | chi2_contingency, 8 | ks_2samp, 9 | mannwhitneyu, 10 | norm, 11 | ttest_ind, 12 | ) 13 | 14 | from ..dataset import Dataset, DatasetAdapter, StatisticRole 15 | from .abstract import CompareExtension 16 | 17 | 18 | class StatTest(CompareExtension): 19 | def __init__( 20 | self, test_function: Callable | None = None, reliability: float = 0.05 21 | ): 22 | super().__init__() 23 | self.test_function = test_function 24 | self.reliability = reliability 25 | 26 | @staticmethod # TODO: remove 27 | def check_other(other: Dataset | None) -> Dataset: 28 | if other is None: 29 | raise ValueError("No other dataset provided") 30 | return other 31 | 32 | @staticmethod 33 | def check_dataset(data: Dataset): 34 | if len(data.columns) != 1: 35 | raise ValueError("Data must be one-dimensional") 36 | 37 | def check_data(self, data: Dataset, other: Dataset | None) -> Dataset: 38 | other = self.check_other(other) 39 | 40 | self.check_dataset(data) 41 | self.check_dataset(other) 42 | 43 | return other 44 | 45 | def _calc_pandas( 46 | self, data: Dataset, other: Dataset | None = None, **kwargs 47 | ) -> Dataset | float: 48 | other = self.check_data(data, other) 49 | if self.test_function is None: 50 | raise ValueError("test_function is needed for execution") 51 | one_result = self.test_function( 52 | data.backend.data.values.flatten(), 53 | other.backend.data.values.flatten(), 54 | **kwargs, 55 | ) 56 | one_result = DatasetAdapter.to_dataset( 57 | { 58 | "p-value": one_result.pvalue, 59 | "statistic": one_result.statistic, 60 | "pass": one_result.pvalue < self.reliability, 61 | }, 62 | StatisticRole(), 63 | ) 64 | return one_result 65 | 66 | 67 | class TTestExtension(StatTest): 68 | def __init__(self, reliability: float = 0.05): 69 | super().__init__(ttest_ind, reliability=reliability) 70 | 71 | def _calc_pandas( 72 | self, data: Dataset, other: Dataset | None = None, **kwargs 73 | ) -> Dataset | float: 74 | # if ( 75 | # next(iter(data.nunique().values())) 76 | # and next(iter(other.nunique().values())) < 2 77 | # ): 78 | # return DatasetAdapter.to_dataset( 79 | # { 80 | # "p-value": [None], 81 | # "statistic": [None], 82 | # "pass": [None], 83 | # }, 84 | # StatisticRole(), 85 | # ) 86 | return super()._calc_pandas(data, other, nan_policy="omit", **kwargs) 87 | 88 | 89 | class KSTestExtension(StatTest): 90 | def __init__(self, reliability: float = 0.05): 91 | super().__init__(ks_2samp, reliability=reliability) 92 | 93 | 94 | class UTestExtension(StatTest): 95 | def __init__(self, reliability: float = 
0.05): 96 | super().__init__(mannwhitneyu, reliability=reliability) 97 | 98 | 99 | class Chi2TestExtension(StatTest): 100 | @staticmethod 101 | def mini_category_replace(counts: Dataset) -> Dataset: 102 | mini_counts = counts["count"][counts["count"] < 7] 103 | if len(mini_counts) > 0: 104 | counts = counts.append( 105 | Dataset.from_dict( 106 | [{counts.columns[0]: "other", "count": mini_counts["count"].sum()}], 107 | roles=mini_counts.roles, 108 | ) 109 | ) 110 | counts = counts[counts["count"] >= 7] 111 | return counts 112 | 113 | def matrix_preparation(self, data: Dataset, other: Dataset) -> Dataset | None: 114 | proportion = len(data) / (len(data) + len(other)) 115 | counted_data = data.value_counts() 116 | counted_data = self.mini_category_replace(counted_data) 117 | data_vc = counted_data["count"] * (1 - proportion) 118 | 119 | counted_other = other.value_counts() 120 | counted_other = self.mini_category_replace(counted_other) 121 | other_vc = counted_other["count"] * proportion 122 | 123 | if len(counted_data) < 2: 124 | return None 125 | data_vc = data_vc.add_column(counted_data[counted_data.columns[0]]) 126 | other_vc = other_vc.add_column(counted_other[counted_other.columns[0]]) 127 | return data_vc.merge(other_vc, on=counted_data.columns[0])[ 128 | ["count_x", "count_y"] 129 | ].fillna(0) 130 | 131 | def _calc_pandas( 132 | self, data: Dataset, other: Dataset | None = None, **kwargs 133 | ) -> Dataset | float: 134 | other = self.check_data(data, other) 135 | matrix = self.matrix_preparation(data, other) 136 | if matrix is None: 137 | warnings.warn(f"Matrix Chi2 is empty for {data.columns[0]}. Returning None") 138 | return DatasetAdapter.to_dataset( 139 | { 140 | "p-value": [None], 141 | "statistic": [None], 142 | "pass": [None], 143 | }, 144 | StatisticRole(), 145 | ) 146 | one_result = chi2_contingency(matrix.backend.data) 147 | return DatasetAdapter.to_dataset( 148 | { 149 | "p-value": ( 150 | one_result[1] 151 | if isinstance(one_result, tuple) 152 | else one_result.pvalue 153 | ), 154 | "statistic": ( 155 | one_result[0] 156 | if isinstance(one_result, tuple) 157 | else one_result.statistic 158 | ), 159 | "pass": ( 160 | one_result[1] 161 | if isinstance(one_result, tuple) 162 | else one_result.pvalue 163 | ) 164 | < self.reliability, 165 | }, 166 | StatisticRole(), 167 | ) 168 | 169 | 170 | class NormCDF(StatTest): 171 | def _calc_pandas( 172 | self, data: Dataset, other: Dataset | None = None, **kwargs 173 | ) -> Dataset | float: 174 | result = norm.cdf(abs(data.get_values()[0][0])) 175 | return DatasetAdapter.to_dataset( 176 | {"p-value": 2 * (1 - result)}, 177 | StatisticRole(), 178 | ) 179 | -------------------------------------------------------------------------------- /hypex/experiments/base_complex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from itertools import product 4 | from typing import Any, Sequence 5 | 6 | from tqdm import tqdm 7 | 8 | from ..dataset import ABCRole, Dataset, ExperimentData, GroupingRole 9 | from ..executor import Executor, IfExecutor 10 | from ..reporters import DatasetReporter, Reporter 11 | from ..utils.enums import ExperimentDataEnum 12 | from .base import Experiment 13 | 14 | 15 | class ExperimentWithReporter(Experiment): 16 | def __init__( 17 | self, 18 | executors: Sequence[Executor], 19 | reporter: Reporter, 20 | transformer: bool | None = None, 21 | key: str = "", 22 | ): 23 | super().__init__(executors, transformer, key) 24 | self.reporter = 
reporter 25 | 26 | def one_iteration( 27 | self, data: ExperimentData, key: str = "", set_key_as_index: bool = False 28 | ): 29 | t_data = ExperimentData(data.ds) 30 | self.key = key 31 | t_data = super().execute(t_data) 32 | result = self.reporter.report(t_data) 33 | if set_key_as_index: 34 | result.index = [key] 35 | return result 36 | 37 | def _set_result( 38 | self, data: ExperimentData, result: list[Dataset], reset_index: bool = True 39 | ): 40 | result = ( 41 | result[0].append(result[1:], reset_index=reset_index) 42 | if len(result) > 1 43 | else result[0] 44 | ) 45 | return self._set_value(data, result) 46 | 47 | 48 | class CycledExperiment(ExperimentWithReporter): 49 | def __init__( 50 | self, 51 | executors: list[Executor], 52 | reporter: DatasetReporter, 53 | n_iterations: int, 54 | transformer: bool | None = None, 55 | key: str = "", 56 | ): 57 | super().__init__(executors, reporter, transformer, key) 58 | self.n_iterations: int = n_iterations 59 | 60 | def generate_params_hash(self) -> str: 61 | return f"{self.reporter.__class__.__name__} x {self.n_iterations}" 62 | 63 | def execute(self, data: ExperimentData) -> ExperimentData: 64 | result: list[Dataset] = [ 65 | self.one_iteration(data, str(i)) for i in tqdm(range(self.n_iterations)) 66 | ] 67 | return self._set_result(data, result) 68 | 69 | 70 | class GroupExperiment(ExperimentWithReporter): 71 | def __init__( 72 | self, 73 | executors: Sequence[Executor], 74 | reporter: Reporter, 75 | searching_role: ABCRole = GroupingRole(), 76 | transformer: bool | None = None, 77 | key: str = "", 78 | ): 79 | self.searching_role = searching_role 80 | super().__init__(executors, reporter, transformer, key) 81 | 82 | def generate_params_hash(self) -> str: 83 | return f"GroupExperiment: {self.reporter.__class__.__name__}" 84 | 85 | def execute(self, data: ExperimentData) -> ExperimentData: 86 | group_field = data.ds.search_columns(self.searching_role) 87 | result: list[Dataset] = [ 88 | self.one_iteration( 89 | ExperimentData(group_data), str(group[0]), set_key_as_index=True 90 | ) 91 | for group, group_data in tqdm(data.ds.groupby(group_field)) 92 | ] 93 | return self._set_result(data, result, reset_index=False) 94 | 95 | 96 | class ParamsExperiment(ExperimentWithReporter): 97 | def __init__( 98 | self, 99 | executors: Sequence[Executor], 100 | reporter: DatasetReporter, 101 | params: dict[type, dict[str, Sequence[Any]]], 102 | transformer: bool | None = None, 103 | key: str = "", 104 | ): 105 | super().__init__(executors, reporter, transformer, key) 106 | self._params = params 107 | self._flat_params: list[dict[type, dict[str, Any]]] = [] 108 | 109 | def generate_params_hash(self) -> str: 110 | return f"ParamsExperiment: {self.reporter.__class__.__name__}" 111 | 112 | def _update_flat_params(self): 113 | classes = list(self._params) 114 | param_combinations = [ 115 | list( 116 | product( 117 | *[ 118 | product([parameter], values) 119 | for parameter, values in class_params.items() 120 | ] 121 | ) 122 | ) 123 | for class_params in self._params.values() 124 | ] 125 | new_flat_params = [ 126 | { 127 | classes[i]: dict(param_combination[i]) 128 | for i in range(len(param_combination)) 129 | } 130 | for param_combination in product(*param_combinations) 131 | ] 132 | self._flat_params = new_flat_params 133 | 134 | @property 135 | def flat_params(self) -> list[dict[type, dict[str, Any]]]: 136 | return self._flat_params 137 | 138 | @property 139 | def params(self) -> dict[type, dict[str, Sequence[Any]]]: 140 | return self._params 141 | 142 | 
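# Illustrative note (not part of the source): _update_flat_params expands the
# per-class grids into their Cartesian product. With hypothetical entries
#     params = {AASplitter: {"random_state": [0, 1]}, TTest: {"reliability": [0.05]}}
# it produces
#     flat_params = [
#         {AASplitter: {"random_state": 0}, TTest: {"reliability": 0.05}},
#         {AASplitter: {"random_state": 1}, TTest: {"reliability": 0.05}},
#     ]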
@params.setter 143 | def params(self, params: dict[type, dict[str, Sequence[Any]]]): 144 | self._params = params 145 | self._update_flat_params() 146 | 147 | def execute(self, data: ExperimentData) -> ExperimentData: 148 | results = [] 149 | self._update_flat_params() 150 | for flat_param in tqdm(self._flat_params): 151 | t_data = ExperimentData(data.ds) 152 | for executor in self.executors: 153 | executor.set_params(flat_param) 154 | t_data = executor.execute(t_data) 155 | report = self.reporter.report(t_data) 156 | results.append(report) 157 | return self._set_result(data, results) 158 | 159 | 160 | class IfParamsExperiment(ParamsExperiment): 161 | def __init__( 162 | self, 163 | executors: Sequence[Executor], 164 | reporter: DatasetReporter, 165 | params: dict[type, dict[str, Sequence[Any]]], 166 | stopping_criterion: IfExecutor, 167 | transformer: bool | None = None, 168 | key: str = "", 169 | ): 170 | self.stopping_criterion = stopping_criterion 171 | super().__init__(executors, reporter, params, transformer, key) 172 | 173 | def execute(self, data: ExperimentData) -> ExperimentData: 174 | self._update_flat_params() 175 | for flat_param in tqdm(self._flat_params): 176 | t_data = ExperimentData(data.ds) 177 | for executor in self.executors: 178 | executor.set_params(flat_param) 179 | t_data = executor.execute(t_data) 180 | if_result = self.stopping_criterion.execute(t_data) 181 | if_executor_id = if_result.get_one_id( 182 | self.stopping_criterion.__class__, ExperimentDataEnum.variables 183 | ) 184 | if if_result.variables[if_executor_id]["response"]: 185 | return self._set_result(data, [self.reporter.report(t_data)]) 186 | return data 187 | -------------------------------------------------------------------------------- /hypex/ui/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from ..dataset import Dataset, ExperimentData 6 | from ..experiments.base import Experiment 7 | from ..reporters import Reporter 8 | from ..utils import ID_SPLIT_SYMBOL 9 | from ..utils.enums import RenameEnum 10 | 11 | 12 | class Output: 13 | """A class for handling experiment output reporting and formatting. 14 | 15 | This class manages the reporting and formatting of experiment results, allowing for both 16 | a primary resume report and additional custom reports. 17 | 18 | Attributes: 19 | resume (Dataset): The main summary report of the experiment results. 20 | _experiment_data (ExperimentData): Internal storage of the experiment data. 21 | 22 | Args: 23 | resume_reporter (Reporter): The main reporter that generates the resume output. 24 | additional_reporters (Optional[Dict[str, Reporter]]): Dictionary mapping attribute 25 | names to additional reporters for custom reporting. Defaults to None. 26 | 27 | Examples 28 | -------- 29 | .. 
code-block:: python 30 | 31 | # Basic usage with just a resume reporter 32 | from my_reporters import MyResumeReporter 33 | output = Output(resume_reporter=MyResumeReporter()) 34 | output.extract(experiment_data) 35 | print(output.resume) 36 | 37 | # Using additional custom reporters 38 | from my_reporters import StatsReporter, PlotReporter 39 | additional = { 40 | 'statistics': StatsReporter(), 41 | 'plots': PlotReporter() 42 | } 43 | output = Output( 44 | resume_reporter=MyResumeReporter(), 45 | additional_reporters=additional 46 | ) 47 | output.extract(experiment_data) 48 | print(output.statistics) # Access additional report 49 | print(output.plots) # Access additional report 50 | """ 51 | 52 | resume: Dataset 53 | _experiment_data: ExperimentData 54 | 55 | def __init__( 56 | self, 57 | resume_reporter: Reporter, 58 | additional_reporters: dict[str, Reporter] | None = None, 59 | ): 60 | self.resume_reporter = resume_reporter 61 | self.additional_reporters = additional_reporters or {} 62 | 63 | def _extract_by_reporters(self, experiment_data: ExperimentData): 64 | """Extracts reports from all configured reporters. 65 | 66 | Args: 67 | experiment_data (ExperimentData): The experiment data to generate reports from. 68 | """ 69 | self.resume = self.resume_reporter.report(experiment_data) 70 | for attribute, reporter in self.additional_reporters.items(): 71 | setattr(self, attribute, reporter.report(experiment_data)) 72 | self._experiment_data = experiment_data 73 | 74 | @staticmethod 75 | def _replace_splitters( 76 | data: Dataset, mode: RenameEnum = RenameEnum.columns 77 | ) -> Dataset: 78 | result = data 79 | if mode in (RenameEnum.all, RenameEnum.columns): 80 | result = result.rename( 81 | {c: c.replace(ID_SPLIT_SYMBOL, " ") for c in result.columns} 82 | ) 83 | if mode in (RenameEnum.all, RenameEnum.index): 84 | result.index = [i.replace(ID_SPLIT_SYMBOL, " ") for i in result.index] 85 | return result 86 | 87 | def extract(self, experiment_data: ExperimentData): 88 | """Extracts and processes all reports from the experiment data. 89 | 90 | Args: 91 | experiment_data (ExperimentData): The experiment data to generate reports from. 92 | 93 | Examples 94 | -------- 95 | .. code-block:: python 96 | 97 | output = Output(resume_reporter=MyReporter()) 98 | output.extract(experiment_data) 99 | print(output.resume) # Access the main report 100 | """ 101 | self._extract_by_reporters(experiment_data) 102 | 103 | 104 | class ExperimentShell: 105 | """Base class for experiment execution with configurable output handling. 106 | 107 | This class provides a shell for executing experiments with customizable parameters 108 | and output formatting. It serves as a base class for specific experiment types 109 | like A/B tests and A/A tests. 110 | 111 | Args: 112 | experiment (Experiment): The experiment configuration to execute. 113 | output (Output): Output handler that defines how results are formatted. 114 | experiment_params (Optional[Dict[str, Any]], optional): Additional parameters 115 | to configure the experiment. Defaults to None. 116 | 117 | Examples 118 | -------- 119 | .. 
code-block:: python 120 | 121 | # Basic usage with default parameters 122 | experiment = Experiment([...]) # Configure experiment 123 | output = Output(resume_reporter=MyReporter()) 124 | shell = ExperimentShell(experiment, output) 125 | results = shell.execute(data) 126 | 127 | # With custom experiment parameters 128 | params = { 129 | "random_state": 42, 130 | "test_size": 0.3 131 | } 132 | shell = ExperimentShell( 133 | experiment=experiment, 134 | output=output, 135 | experiment_params=params 136 | ) 137 | results = shell.execute(data) 138 | """ 139 | 140 | def __init__( 141 | self, 142 | experiment: Experiment, 143 | output: Output, 144 | experiment_params: dict[str, Any] | None = None, 145 | ): 146 | if experiment_params: 147 | experiment.set_params(experiment_params) 148 | self._out = output 149 | self._experiment = experiment 150 | 151 | @property 152 | def experiment(self): 153 | """Gets the configured experiment instance. 154 | 155 | Returns: 156 | Experiment: The experiment configuration object. 157 | """ 158 | return self._experiment 159 | 160 | def execute(self, data: Dataset | ExperimentData) -> Output: 161 | """Executes the experiment on the provided data. 162 | 163 | Runs the configured experiment on the input data and formats the results 164 | using the configured output handler. 165 | 166 | Args: 167 | data (Union[Dataset, ExperimentData]): Input data for the experiment. 168 | Can be either a Dataset or ExperimentData instance. 169 | 170 | Returns: 171 | Output: Formatted experiment results through the configured output handler. 172 | 173 | Examples 174 | -------- 175 | .. code-block:: python 176 | 177 | shell = ExperimentShell(experiment, output) 178 | dataset = Dataset(...) # Your input data 179 | results = shell.execute(dataset) 180 | print(results.resume) # Access formatted results 181 | """ 182 | if isinstance(data, Dataset): 183 | data = ExperimentData(data) 184 | result_experiment_data = self._experiment.execute(data) 185 | self._out.extract(result_experiment_data) 186 | return self._out 187 | -------------------------------------------------------------------------------- /hypex/splitters/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from ..dataset import ( 9 | AdditionalTreatmentRole, 10 | Dataset, 11 | ExperimentData, 12 | StratificationRole, 13 | TreatmentRole, 14 | ) 15 | from ..dataset.roles import ConstGroupRole 16 | from ..executor import Calculator 17 | from ..utils import ExperimentDataEnum 18 | 19 | 20 | class AASplitter(Calculator): 21 | def __init__( 22 | self, 23 | control_size: float = 0.5, 24 | random_state: int | None = None, 25 | sample_size: float | None = None, 26 | constant_key: bool = True, 27 | save_groups: bool = True, 28 | key: Any = "", 29 | ): 30 | self.control_size = control_size 31 | self.random_state = random_state 32 | self._key = key 33 | self.constant_key = constant_key 34 | self.save_groups = save_groups 35 | self.sample_size = sample_size 36 | super().__init__(key) 37 | 38 | def _generate_params_hash(self): 39 | hash_parts: list[str] = [] 40 | if self.control_size != 0.5: 41 | hash_parts.append(f"cs {self.control_size}") 42 | if self.random_state is not None: 43 | hash_parts.append(f"rs {self.random_state}") 44 | self._params_hash = "|".join(hash_parts) 45 | 46 | def init_from_hash(self, params_hash: str): 47 | hash_parts: list[str] = params_hash.split("|") 48 | 
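# Illustrative: a hash produced by _generate_params_hash, e.g. "cs 0.3|rs 42",
# restores control_size=0.3 and random_state=42 in the loop below.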
for hash_part in hash_parts: 49 | if hash_part.startswith("cs"): 50 | self.control_size = float(hash_part[hash_part.rfind(" ") + 1 :]) 51 | elif hash_part.startswith("rs"): 52 | self.random_state = int(hash_part[hash_part.rfind(" ") + 1 :]) 53 | self._generate_id() 54 | 55 | @property 56 | def key(self) -> Any: 57 | return self._key 58 | 59 | @key.setter 60 | def key(self, value: Any): 61 | if not self.constant_key: 62 | self._key = value 63 | self._generate_id() 64 | 65 | def _set_value(self, data: ExperimentData, value, key=None) -> ExperimentData: 66 | data = data.set_value( 67 | ExperimentDataEnum.additional_fields, 68 | self._id, 69 | value, 70 | role=AdditionalTreatmentRole(), 71 | ) 72 | 73 | if self.save_groups: 74 | data.groups[self.id] = { 75 | group: data.ds.loc[group_data.index] 76 | for group, group_data in data.additional_fields.groupby(self.id) 77 | } 78 | return data 79 | 80 | @staticmethod 81 | def _inner_function( 82 | data: Dataset, 83 | random_state: int | None = None, 84 | control_size: float = 0.5, 85 | sample_size: float | None = None, 86 | const_group_field: str | None = None, 87 | **kwargs, 88 | ) -> list[str]: 89 | sample_size = 1.0 if sample_size is None else sample_size 90 | control_indexes = [] 91 | if const_group_field: 92 | const_data = dict(data.groupby(const_group_field)) 93 | control_data = const_data.get("control") 94 | if control_data is not None: 95 | control_indexes = list(control_data.index) 96 | const_size = sum(len(cd) for cd in const_data.values()) 97 | control_size = (len(data) * control_size - const_size) / ( 98 | len(data) - const_size 99 | ) 100 | experiment_data = ( 101 | data[data[const_group_field].isna()] if const_group_field else data 102 | ) 103 | experiment_data_index = experiment_data.sample( 104 | frac=sample_size, random_state=random_state 105 | ).index 106 | addition_indexes = list(experiment_data_index) 107 | edge = int(len(addition_indexes) * control_size) 108 | control_indexes += addition_indexes[:edge] 109 | 110 | split_series = pd.Series(np.ones(data.data.shape[0], dtype="int"), index=data.data.index) 111 | split_series[control_indexes] -= 1 112 | split_series = split_series.map({0: "control", 1: "test"}) 113 | 114 | return split_series.to_list() 115 | 116 | def execute(self, data: ExperimentData) -> ExperimentData: 117 | const_group_fields = data.ds.search_columns(ConstGroupRole()) 118 | const_group_fields = ( 119 | const_group_fields[0] if len(const_group_fields) > 0 else None 120 | ) 121 | result = self.calc( 122 | data.ds, 123 | random_state=self.random_state, 124 | control_size=self.control_size, 125 | sample_size=self.sample_size, 126 | const_group_field=const_group_fields, 127 | ) 128 | return self._set_value( 129 | data, 130 | result, 131 | ) 132 | 133 | 134 | class AASplitterWithStratification(AASplitter): 135 | @staticmethod 136 | def _inner_function( 137 | data: Dataset, 138 | random_state: int | None = None, 139 | control_size: float = 0.5, 140 | grouping_fields=None, 141 | **kwargs, 142 | ) -> list[str] | Dataset: 143 | if not grouping_fields: 144 | return AASplitter._inner_function( 145 | data, random_state, control_size, **kwargs 146 | ) 147 | result = {"split": []} 148 | index = [] 149 | for group, group_data in data.groupby(grouping_fields): 150 | result["split"].extend( 151 | AASplitter._inner_function(group_data, random_state, control_size) 152 | ) 153 | index.extend(list(group_data.index)) 154 | return Dataset.from_dict(result, index=index, roles={"split": TreatmentRole()}) 155 | 156 | def execute(self, data: 
ExperimentData) -> ExperimentData: 157 | grouping_fields = data.ds.search_columns(StratificationRole()) 158 | result = self.calc( 159 | data.ds, 160 | random_state=self.random_state, 161 | control_size=self.control_size, 162 | grouping_fields=grouping_fields, 163 | ) 164 | if isinstance(result, Dataset): 165 | result = result.replace_roles({"split": AdditionalTreatmentRole()}) 166 | return self._set_value(data, result) 167 | 168 | 169 | # 170 | # class AASplitterWithStratification(AASplitter): 171 | # def __init__( 172 | # self, 173 | # control_size: float = 0.5, 174 | # random_state: Optional[int] = None, 175 | # # key: Any = "", 176 | # ): 177 | # super().__init__(control_size, random_state, key) 178 | # 179 | # def calc(self, data: Dataset): 180 | # stratification_columns = data.get_columns_by_roles(StratificationRole()) 181 | # 182 | # groups = data.groupby(stratification_columns) 183 | # result = Dataset._create_empty() 184 | # for _, gd in groups: 185 | # ged = ExperimentData(gd) 186 | # ged = super().execute(ged) 187 | # 188 | # result = ged if result is None else result.append(ged) 189 | # return result["group"] 190 | 191 | 192 | # As idea 193 | # class SplitterAAMulti(ExperimentMulti): 194 | # def execute(self, data): 195 | # raise NotImplementedError 196 | -------------------------------------------------------------------------------- /hypex/reporters/aa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | from typing import Any, ClassVar 5 | 6 | from ..comparators import Chi2Test, GroupDifference, GroupSizes, KSTest, TTest 7 | from ..dataset import Dataset, ExperimentData, InfoRole, StatisticRole 8 | from ..splitters import AASplitter, AASplitterWithStratification 9 | from ..utils import ID_SPLIT_SYMBOL, ExperimentDataEnum, NotFoundInExperimentDataError 10 | from .abstract import Reporter, TestDictReporter 11 | 12 | 13 | class OneAADictReporter(TestDictReporter): 14 | tests: ClassVar[list] = [TTest, KSTest, Chi2Test] 15 | 16 | @staticmethod 17 | def convert_flat_dataset(data: dict) -> Dataset: 18 | struct_dict = OneAADictReporter._get_struct_dict(data) 19 | return OneAADictReporter._convert_struct_dict_to_dataset(struct_dict) 20 | 21 | @staticmethod 22 | def get_splitter_id(data: ExperimentData): 23 | for c in [AASplitter, AASplitterWithStratification]: 24 | with contextlib.suppress(NotFoundInExperimentDataError): 25 | return data.get_one_id(c, ExperimentDataEnum.additional_fields) 26 | 27 | def extract_group_difference(self, data: ExperimentData) -> dict[str, Any]: 28 | group_difference_ids = data.get_ids(GroupDifference)[GroupDifference.__name__][ 29 | ExperimentDataEnum.analysis_tables.value 30 | ] 31 | return self._extract_from_comparators(data, group_difference_ids) 32 | 33 | def extract_group_sizes(self, data: ExperimentData) -> dict[str, Any]: 34 | group_sizes_id = data.get_one_id(GroupSizes, ExperimentDataEnum.analysis_tables) 35 | return self._extract_from_comparators(data, [group_sizes_id]) 36 | 37 | def extract_analyzer_data(self, data: ExperimentData) -> dict[str, Any]: 38 | analyzer_id = data.get_one_id( 39 | "OneAAStatAnalyzer", ExperimentDataEnum.analysis_tables 40 | ) 41 | return self.extract_from_one_row_dataset(data.analysis_tables[analyzer_id]) 42 | 43 | def extract_data_from_analysis_tables(self, data: ExperimentData) -> dict[str, Any]: 44 | result = {} 45 | result.update(self.extract_group_difference(data)) 46 | # 
result.update(self.extract_group_sizes(data)) 47 | result.update(self.extract_tests(data)) 48 | result.update(self.extract_analyzer_data(data)) 49 | if self.front: 50 | result = self.rename_passed(result) 51 | return result 52 | 53 | def report(self, data: ExperimentData) -> dict[str, Any]: 54 | result = { 55 | "splitter_id": self.get_splitter_id(data), 56 | } 57 | result.update(self.extract_data_from_analysis_tables(data)) 58 | return result 59 | 60 | 61 | class AADatasetReporter(OneAADictReporter): 62 | def report(self, data: ExperimentData): 63 | front_buffer = self.front 64 | self.front = False 65 | dict_report = super().report(data) 66 | self.front = front_buffer 67 | return self.convert_flat_dataset(dict_report) 68 | 69 | 70 | class AAPassedReporter(Reporter): 71 | @staticmethod 72 | def _reformat_aa_score_table(table: Dataset) -> Dataset: 73 | result = {} 74 | for ind in table.index: 75 | splitted_index = ind.split(ID_SPLIT_SYMBOL) 76 | row_index = f"{splitted_index[0]}{ID_SPLIT_SYMBOL}{splitted_index[-1]}" 77 | value = table.get_values(ind, "pass") 78 | if row_index not in result: 79 | result[row_index] = {splitted_index[1]: value} 80 | else: 81 | result[row_index][splitted_index[1]] = value 82 | result = Dataset.from_dict(result, roles={}).transpose() * 1 83 | return result 84 | 85 | @staticmethod 86 | def _reformat_best_split_table(table: Dataset) -> Dataset: 87 | passed = table.loc[:, [c for c in table.columns if (c.endswith("pass"))]] 88 | new_index = table.apply( 89 | lambda x: f"{x['feature']}{ID_SPLIT_SYMBOL}{x['group']}", 90 | {"index": InfoRole()}, 91 | axis=1, 92 | ) 93 | passed.index = new_index.get_values(column="index") 94 | passed = passed.rename( 95 | names={c: c[: c.rfind("pass") - 1] for c in passed.columns} 96 | ) 97 | passed = passed.replace("OK", 1).replace("NOT OK", 0) 98 | passed = passed.astype({c: int for c in passed.columns}, errors="ignore") 99 | return passed 100 | 101 | def _detect_pass(self, analyzer_tables: dict[str, Dataset]): 102 | score_table = self._reformat_aa_score_table(analyzer_tables["aa score"]) 103 | best_split_table = self._reformat_best_split_table( 104 | analyzer_tables["best split statistics"] 105 | ) 106 | resume_table = score_table * best_split_table 107 | resume_table = resume_table.apply( 108 | lambda x: "OK" if x.sum() > 0 else "NOT OK", 109 | axis=1, 110 | role={"result": StatisticRole()}, 111 | ) 112 | result = score_table.merge( 113 | best_split_table, 114 | suffixes=(" aa test", " best split"), 115 | left_index=True, 116 | right_index=True, 117 | ) 118 | result = result.merge(resume_table, left_index=True, right_index=True) 119 | result.roles = {c: r.__class__(str) for c, r in result.roles.items()} 120 | result = ( 121 | result.replace(0, "NOT OK") 122 | .replace(1, "OK") 123 | .replace("0", "NOT OK") 124 | .replace("1", "OK") 125 | ) 126 | splitted_index = [str(i).split(ID_SPLIT_SYMBOL) for i in result.index] 127 | result.add_column([i[0] for i in splitted_index], role={"feature": InfoRole()}) 128 | result.add_column([i[1] for i in splitted_index], role={"group": InfoRole()}) 129 | result.index = range(len(splitted_index)) 130 | return result 131 | 132 | def report(self, data: ExperimentData) -> Dataset: 133 | analyser_ids = data.get_ids( 134 | "AAScoreAnalyzer", ExperimentDataEnum.analysis_tables 135 | ) 136 | analyser_tables = { 137 | id_[id_.rfind(ID_SPLIT_SYMBOL) + 1 :]: data.analysis_tables[id_] 138 | for id_ in analyser_ids["AAScoreAnalyzer"][ 139 | ExperimentDataEnum.analysis_tables.value 140 | ] 141 | } 142 | if not 
analyser_tables["aa score"]: 143 | print("AA test cannot be performed as none of the analyzers passed") 144 | return None 145 | result = self._detect_pass(analyser_tables) 146 | stats_cols = ["feature", "group", "control mean", "test mean", "difference", "difference %"] 147 | differences = analyser_tables["best split statistics"].loc[ 148 | :, 149 | [ 150 | col 151 | for col in stats_cols 152 | if col in analyser_tables["best split statistics"].columns 153 | ], 154 | ] 155 | result = result.merge(differences, on=["feature", "group"], how="left") 156 | result = result[ 157 | ["feature", "group"] 158 | + [c for c in result.columns if c not in ["feature", "group"]] 159 | ] 160 | numeric_cols = ["control mean", "test mean", "difference", "difference %"] 161 | for col in numeric_cols: 162 | result.data[col] = result.data[col].astype(float).round(6) 163 | return result 164 | 165 | 166 | class AABestSplitReporter(Reporter): 167 | def report(self, data: ExperimentData): 168 | best_split_id = next( 169 | (c for c in data.additional_fields.columns if c.endswith("best")), [] 170 | ) 171 | markers = data.additional_fields.loc[:, best_split_id] 172 | markers = markers.rename({markers.columns[0]: "split"}) 173 | return data.ds.merge(markers, left_index=True, right_index=True) 174 | -------------------------------------------------------------------------------- /tests/test_tutorials.py: -------------------------------------------------------------------------------- 1 | # starts with HYPEX-dir: PYTHONPATH=$(pwd) pytest 2 | import random 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pandas.testing as pdt 7 | import pytest 8 | 9 | from hypex import AATest, ABTest, Matching 10 | from hypex.dataset import ( 11 | Dataset, 12 | FeatureRole, 13 | InfoRole, 14 | StratificationRole, 15 | TargetRole, 16 | TreatmentRole, 17 | ) 18 | 19 | 20 | @pytest.fixture 21 | def aa_data(): 22 | return [ 23 | Dataset( 24 | roles={ 25 | "user_id": InfoRole(int), 26 | "treat": TreatmentRole(int), 27 | "pre_spends": TargetRole(), 28 | "post_spends": TargetRole(), 29 | "gender": StratificationRole(str), 30 | }, 31 | data="examples/tutorials/data.csv", 32 | ), 33 | Dataset( 34 | roles={ 35 | "user_id": InfoRole(int), 36 | "treat": TreatmentRole(int), 37 | "pre_spends": TargetRole(), 38 | "post_spends": TargetRole(), 39 | "gender": TargetRole(str), 40 | }, 41 | data="examples/tutorials/data.csv", 42 | ), 43 | ] 44 | 45 | 46 | @pytest.fixture 47 | def ab_data(): 48 | random.seed(7) 49 | data = Dataset( 50 | roles={ 51 | "user_id": InfoRole(int), 52 | "treat": TreatmentRole(), 53 | "pre_spends": TargetRole(), 54 | "post_spends": TargetRole(), 55 | "gender": TargetRole(), 56 | }, 57 | data="examples/tutorials/data.csv", 58 | ) 59 | data["treat"] = [random.choice([0, 1, 2]) for _ in range(len(data))] 60 | return data 61 | 62 | 63 | @pytest.fixture 64 | def matching_data(): 65 | data = Dataset( 66 | roles={ 67 | "user_id": InfoRole(int), 68 | "treat": TreatmentRole(int), 69 | "post_spends": TargetRole(float), 70 | }, 71 | data="examples/tutorials/data.csv", 72 | default_role=FeatureRole(), 73 | ) 74 | data = data.fillna(method="bfill") 75 | return data 76 | 77 | 78 | def test_aatest(aa_data): 79 | mapping = { 80 | "aa-casual": AATest(n_iterations=10), 81 | "aa-rs": AATest(random_states=[56, 72, 2, 43]), 82 | "aa-strat": AATest(random_states=[56, 72, 2, 43], stratification=True), 83 | "aa-sample": AATest(n_iterations=10, sample_size=0.3), 84 | "aa-cat_target": AATest(n_iterations=10), 85 | "aa-equal_var": 
AATest(n_iterations=10, t_test_equal_var=False), 86 | } 87 | 88 | mapping_resume = { 89 | "aa-casual": pd.DataFrame( 90 | { 91 | "TTest aa test": {0: "OK", 1: "OK"}, 92 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 93 | "TTest best split": {0: "OK", 1: "OK"}, 94 | "KSTest best split": {0: "OK", 1: "OK"}, 95 | "result": {0: "OK", 1: "OK"}, 96 | } 97 | ), 98 | "aa-rs": pd.DataFrame( 99 | { 100 | "TTest aa test": {0: "OK", 1: "OK"}, 101 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 102 | "TTest best split": {0: "OK", 1: "OK"}, 103 | "KSTest best split": {0: "OK", 1: "OK"}, 104 | "result": {0: "OK", 1: "OK"}, 105 | } 106 | ), 107 | "aa-strat": pd.DataFrame( 108 | { 109 | "TTest aa test": {0: "OK", 1: "NOT OK"}, 110 | "KSTest aa test": {0: "OK", 1: "NOT OK"}, 111 | "TTest best split": {0: "OK", 1: "OK"}, 112 | "KSTest best split": {0: "OK", 1: "OK"}, 113 | "result": {0: "OK", 1: "NOT OK"}, 114 | } 115 | ), 116 | "aa-sample": pd.DataFrame( 117 | { 118 | "TTest aa test": {0: "OK", 1: "OK"}, 119 | "KSTest aa test": {0: "OK", 1: "OK"}, 120 | "TTest best split": {0: "NOT OK", 1: "NOT OK"}, 121 | "KSTest best split": {0: "OK", 1: "OK"}, 122 | "result": {0: "OK", 1: "OK"}, 123 | } 124 | ), 125 | "aa-cat_target": pd.DataFrame( 126 | { 127 | "TTest aa test": ["OK", "OK", np.nan], 128 | "KSTest aa test": ["NOT OK", "OK", np.nan], 129 | "Chi2Test aa test": [np.nan, np.nan, "OK"], 130 | "TTest best split": ["OK", "OK", np.nan], 131 | "KSTest best split": ["OK", "OK", np.nan], 132 | "Chi2Test best split": [np.nan, np.nan, "OK"], 133 | "result": ["OK", "OK", "OK"], 134 | } 135 | ), 136 | "aa-equal_var": pd.DataFrame( 137 | { 138 | "TTest aa test": {0: "OK", 1: "OK"}, 139 | "KSTest aa test": {0: "NOT OK", 1: "OK"}, 140 | "TTest best split": {0: "OK", 1: "OK"}, 141 | "KSTest best split": {0: "OK", 1: "OK"}, 142 | "result": {0: "OK", 1: "OK"}, 143 | } 144 | ), 145 | } 146 | 147 | for test_name in mapping.keys(): 148 | print(test_name) 149 | if test_name == "aa-cat_target": 150 | res = mapping[test_name].execute(aa_data[1]) 151 | else: 152 | res = mapping[test_name].execute(aa_data[0]) 153 | actual_data = res.resume.data.iloc[:, 2:-4] 154 | expected_data = mapping_resume[test_name] 155 | pdt.assert_frame_equal(expected_data, actual_data, check_dtype=False) 156 | 157 | 158 | def test_abtest(ab_data): 159 | mapping = { 160 | "ab-casual": ABTest(), 161 | "ab-additional": ABTest(additional_tests=["t-test", "u-test", "chi2-test"]), 162 | "ab-n": ABTest(multitest_method="bonferroni"), 163 | } 164 | 165 | mapping_resume = { 166 | "ab-casual": pd.DataFrame( 167 | {"TTest pass": {0: "NOT OK", 1: "NOT OK", 2: "NOT OK", 3: "NOT OK"}} 168 | ), 169 | "ab-additional": pd.DataFrame( 170 | { 171 | "TTest pass": { 172 | 0: "NOT OK", 173 | 1: "NOT OK", 174 | 2: "NOT OK", 175 | 3: "NOT OK", 176 | 4: 0, 177 | 5: 0, 178 | }, 179 | "UTest pass": { 180 | 0: "NOT OK", 181 | 1: "NOT OK", 182 | 2: "NOT OK", 183 | 3: "NOT OK", 184 | 4: 0, 185 | 5: 0, 186 | }, 187 | "Chi2Test pass": {0: 0, 1: 0, 2: 0, 3: 0, 4: "NOT OK", 5: "NOT OK"}, 188 | } 189 | ), 190 | "ab-n": pd.DataFrame( 191 | {"TTest pass": {0: "NOT OK", 1: "NOT OK", 2: "NOT OK", 3: "NOT OK"}} 192 | ), 193 | } 194 | 195 | for test_name in mapping.keys(): 196 | res = mapping[test_name].execute(ab_data) 197 | actual_data = ( 198 | res.resume.data.fillna(0) 199 | .apply(pd.to_numeric, errors="ignore") 200 | .iloc[:, 6::2] 201 | ) 202 | expected_data = mapping_resume[test_name] 203 | pdt.assert_frame_equal(expected_data, actual_data, check_dtype=False) 204 | 205 | 206 | def 
test_matchingtest(matching_data):
207 |     mapping = {
208 |         "matching": Matching(),
209 |         "matching-atc": Matching(metric="atc"),
210 |         "matching-att": Matching(metric="att"),
211 |         "matching-l2": Matching(distance="l2", metric="att"),
212 |         "matching-faiss-auto": Matching(distance="l2", faiss_mode="auto"),
213 |         "matching-faiss_base": Matching(distance="mahalanobis", faiss_mode="base"),
214 |         "matching-n-neighbors": Matching(n_neighbors=2),
215 |     }
216 | 
217 |     for test_name in mapping.keys():
218 |         res = mapping[test_name].execute(matching_data)
219 |         actual_data = res.resume.data
220 |         assert actual_data.index.isin(["ATT", "ATC", "ATE"]).all()
221 |         assert all(
222 |             actual_data.iloc[:, :-1].dtypes.apply(
223 |                 lambda x: pd.api.types.is_numeric_dtype(x)
224 |             )
225 |         ), "There are non-numeric columns!"
226 | 
--------------------------------------------------------------------------------
/hypex/extensions/statsmodels.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import numpy as np
4 | from scipy.stats import norm  # type: ignore
5 | from statsmodels.stats.multitest import multipletests  # type: ignore
6 | 
7 | from ..dataset import Dataset, DatasetAdapter, StatisticRole
8 | from ..utils import ID_SPLIT_SYMBOL, ABNTestMethodsEnum
9 | from .abstract import Extension
10 | 
11 | 
12 | class MultiTest(Extension):
13 |     def __init__(self, method: ABNTestMethodsEnum, alpha: float = 0.05):
14 |         self.method = method
15 |         self.alpha = alpha
16 |         super().__init__()
17 | 
18 |     def _calc_pandas(self, data: Dataset, **kwargs):
19 |         p_values = data.data.values.flatten()
20 |         new_pvalues = multipletests(
21 |             p_values, method=self.method.value, alpha=self.alpha, **kwargs
22 |         )
23 |         return DatasetAdapter.to_dataset(
24 |             {
25 |                 "field": [i.split(ID_SPLIT_SYMBOL)[2] for i in data.index],
26 |                 "test": [i.split(ID_SPLIT_SYMBOL)[0] for i in data.index],
27 |                 "old p-value": p_values,
28 |                 "new p-value": new_pvalues[1],
29 |                 "correction": [
30 |                     j / i if j != 0 else 0.0 for i, j in zip(new_pvalues[1], p_values)
31 |                 ],
32 |                 "rejected": new_pvalues[0],
33 |             },
34 |             StatisticRole(),
35 |         )
36 | 
37 | 
38 | class MultitestQuantile(Extension):
39 |     def __init__(
40 |         self,
41 |         alpha: float = 0.05,
42 |         iteration_size: int = 20000,
43 |         equal_variance: bool = True,
44 |         random_state: int | None = None,
45 |     ):
46 |         self.alpha = alpha
47 |         self.iteration_size = iteration_size
48 |         self.equal_variance = equal_variance
49 |         self.random_state = random_state
50 |         super().__init__()
51 | 
52 |     def _calc_pandas(self, data: Dataset, **kwargs):
53 |         group_field = kwargs.get("group_field")
54 |         target_field = kwargs.get("target_field")
55 |         quantiles = kwargs.get("quantiles")
56 |         num_samples = len(data.unique()[group_field])
57 |         sample_size = len(data)
58 |         grouped_data = data.groupby(by=group_field, fields_list=target_field)
59 |         means = [sample[1].agg("mean") for sample in grouped_data]
60 |         variances = [
61 |             sample[1].agg("var") * sample_size / (sample_size - 1)
62 |             for sample in grouped_data
63 |         ]
64 |         if num_samples != len(means) or num_samples != len(variances):
65 |             num_samples = min(num_samples, len(means), len(variances))
66 |         if isinstance(quantiles, float):
67 |             quantiles = np.full(num_samples, quantiles).tolist()
68 | 
69 |         quantiles = quantiles or self.quantile_of_marginal_distribution(
70 |             num_samples=num_samples,
71 |             quantile_level=1 - self.alpha / num_samples,
72 |             variances=variances,
73 |         )
74 |         for j in range(num_samples):
75 |             min_t_value = np.inf
76 |             for i in range(num_samples):
77 |                 if i != j:
78 |                     t_value = (
79 |                         np.sqrt(sample_size)
80 |                         * (means[j] - means[i])
81 |                         / np.sqrt(variances[j] + variances[i])
82 |                     )
83 |                     min_t_value = min(min_t_value, t_value)
84 |             if min_t_value > quantiles[j]:
85 |                 return DatasetAdapter.to_dataset(
86 |                     {"field": target_field, "accepted hypothesis": j + 1},
87 |                     StatisticRole(),
88 |                 )
89 |         return DatasetAdapter.to_dataset(
90 |             {"field": target_field, "accepted hypothesis": 0}, StatisticRole()
91 |         )
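# --- Editor's note: hedged usage sketch, not part of the library source. -----
# quantile_of_marginal_distribution (next method) estimates, by Monte Carlo,
# the requested quantile of min_{i != j} T_ij -- the smallest standardized
# difference between sample j and every other sample under H0. A minimal call,
# assuming only the constructor shown above (all argument values illustrative):
#
#   mt = MultitestQuantile(alpha=0.05, iteration_size=20000, random_state=7)
#   thresholds = mt.quantile_of_marginal_distribution(
#       num_samples=3, quantile_level=1 - 0.05 / 3
#   )
#   # -> a list of 3 thresholds; all equal when equal_variance=True (the default)
# ------------------------------------------------------------------------------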
92 | 
93 |     def quantile_of_marginal_distribution(
94 |         self,
95 |         num_samples: int,
96 |         quantile_level: float,
97 |         variances: list[float] | None = None,
98 |     ) -> list[float]:
99 |         if variances is None:
100 |             self.equal_variance = True
101 |         num_samples_hyp = 1 if self.equal_variance else num_samples
102 |         quantiles = []
103 |         for j in range(num_samples_hyp):
104 |             t_values = []
105 |             random_samples = norm.rvs(
106 |                 size=[self.iteration_size, num_samples], random_state=self.random_state
107 |             )
108 |             for sample in random_samples:
109 |                 min_t_value = np.inf
110 |                 for i in range(num_samples):
111 |                     if i != j:
112 |                         if self.equal_variance:
113 |                             t_value = (sample[j] - sample[i]) / np.sqrt(2)
114 |                         else:
115 |                             if variances is None:
116 |                                 raise ValueError("variances is needed for execution")
117 |                             t_value = sample[j] / np.sqrt(
118 |                                 1 + variances[i] / variances[j]
119 |                             ) - sample[i] / np.sqrt(1 + variances[j] / variances[i])
120 |                         min_t_value = min(min_t_value, t_value)
121 |                 t_values.append(min_t_value)
122 |             quantiles.append(np.quantile(t_values, quantile_level))
123 |         return (
124 |             np.full(num_samples, quantiles[0]).tolist()
125 |             if self.equal_variance
126 |             else quantiles
127 |         )
128 | 
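# --- Editor's note: hedged usage sketch, not part of the library source. -----
# min_sample_size (next method) grows the per-group size in steps of 100 until
# the simulated share of rejections reaches 1 - power. A minimal call for the
# unequal-variance branch, assuming the signature below (values illustrative):
#
#   mt = MultitestQuantile(alpha=0.05, equal_variance=False, random_state=7)
#   result = mt.min_sample_size(num_samples=3, mde=0.1, variances=[1.0, 1.2, 0.9])
#   # -> {"min sample size": <int>} on this branch; the equal-variance branch
#   #    returns a bare int instead.
# ------------------------------------------------------------------------------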
129 |     def min_sample_size(
130 |         self,
131 |         num_samples: int,
132 |         mde: float,
133 |         variances: list[float] | float,
134 |         power: float = 0.2,
135 |         quantile_1: float | list[float] | None = None,
136 |         quantile_2: float | list[float] | None = None,
137 |         initial_estimate: int = 0,
138 |         iteration_size: int = 3000,
139 |     ):
140 |         if isinstance(quantile_1, float):
141 |             quantile_1 = np.full(num_samples, quantile_1).tolist()
142 |         if isinstance(quantile_2, float):
143 |             quantile_2 = np.full(num_samples, quantile_2).tolist()
144 | 
145 |         quantile_1 = quantile_1 or self.quantile_of_marginal_distribution(
146 |             num_samples=num_samples,
147 |             quantile_level=1 - self.alpha / num_samples,
148 |             variances=variances if isinstance(variances, list) else [variances],
149 |         )
150 |         quantile_2 = quantile_2 or self.quantile_of_marginal_distribution(
151 |             num_samples=num_samples, quantile_level=power
152 |         )
153 | 
154 |         if self.equal_variance:
155 |             return int(2 * variances * ((quantile_1[0] - quantile_2[0]) / mde) ** 2) + 1
156 |         else:
157 |             sizes = []
158 |             for index in range(num_samples):
159 |                 size = initial_estimate
160 |                 current_power = 0
161 |                 while current_power < 1 - power:
162 |                     size += 100
163 |                     current_power = 0
164 |                     total_samples = norm.rvs(
165 |                         size=[iteration_size, num_samples],
166 |                         random_state=self.random_state,
167 |                     )
168 |                     for sample in total_samples:
169 |                         min_t_value = np.inf
170 |                         for i in range(num_samples):
171 |                             if i != index:
172 |                                 t_value = (
173 |                                     sample[index]
174 |                                     / np.sqrt(1 + variances[i] / variances[index])
175 |                                     - sample[i]
176 |                                     / np.sqrt(1 + variances[index] / variances[i])
177 |                                     + mde
178 |                                     * np.sqrt(size / (variances[index] + variances[i]))
179 |                                 )
180 |                                 min_t_value = min(min_t_value, t_value)
181 |                         if min_t_value > quantile_1[index]:
182 |                             current_power += 1
183 |                     current_power /= iteration_size
184 |                 sizes.append(size)
185 |             return {"min sample size": np.max(sizes)}
186 | 
--------------------------------------------------------------------------------
/hypex/dataset/abstract.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import copy
4 | import json  # type: ignore
5 | from abc import ABC
6 | from typing import Any, Iterable
7 | 
8 | import pandas as pd  # type: ignore
9 | 
10 | from ..utils import BackendsEnum, RoleColumnError
11 | from .backends import PandasDataset
12 | from .roles import ABCRole, DefaultRole, default_roles
13 | 
14 | 
15 | def parse_roles(roles: dict) -> dict[str | int, ABCRole]:
16 |     new_roles = {}
17 |     roles = roles or {}
18 |     for role in roles:
19 |         r = default_roles.get(role, role)
20 |         if isinstance(roles[role], list):
21 |             for i in roles[role]:
22 |                 new_roles[i] = copy.deepcopy(r)
23 |         else:
24 |             new_roles[roles[role]] = copy.deepcopy(r)
25 |     return new_roles or roles
26 | 
27 | 
28 | class DatasetBase(ABC):
29 |     @staticmethod
30 |     def _select_backend_from_data(data):
31 |         return PandasDataset(data)
32 | 
33 |     @staticmethod
34 |     def _select_backend_from_str(data, backend):
35 |         if backend == BackendsEnum.pandas:
36 |             return PandasDataset(data)
37 |         if backend is None:
38 |             return PandasDataset(data)
39 |         raise TypeError("Backend must be an instance of BackendsEnum")
40 | 
41 |     def _set_all_roles(self, roles):
42 |         keys = list(roles.keys())
43 |         for column in self.columns:
44 |             if column not in keys:
45 |                 roles[column] = copy.deepcopy(self.default_role) or DefaultRole()
46 |         return roles
47 | 
48 |     def _set_empty_types(self, roles):
49 |         for column, role in roles.items():
50 |             if role.data_type is None:
51 |                 role.data_type = self._backend.get_column_type(column)
52 |             self._backend = self._backend.update_column_type(column, role.data_type)
53 | 
54 |     def __init__(
55 |         self,
56 |         roles: dict[ABCRole, list[str] | str] | dict[str, ABCRole],
57 |         data: pd.DataFrame | str | None = None,
58 |         backend: BackendsEnum | None = None,
59 |         default_role: ABCRole | None = None,
60 |     ):
61 |         self._backend = (
62 |             self._select_backend_from_str(data, backend)
63 |             if backend
64 |             else self._select_backend_from_data(data)
65 |         )
66 |         self.default_role = default_role
67 |         roles = (
68 |             parse_roles(roles)
69 |             if any(isinstance(role, ABCRole) for role in roles.keys())
70 |             else roles
71 |         )
72 |         if any(not isinstance(role, ABCRole) for role in roles.values()):
73 |             raise TypeError("Roles must be instances of ABCRole type")
74 |         if data is not None and any(
75 |             i not in self._backend.columns for i in list(roles.keys())
76 |         ):
77 |             raise RoleColumnError(list(roles.keys()), self._backend.columns)
78 |         if data is not None:
79 |             roles = self._set_all_roles(roles)
80 |             self._set_empty_types(roles)
81 |         self._roles: dict[str, ABCRole] = roles
82 |         self._tmp_roles: (
83 |             dict[ABCRole, list[str] | str] | dict[list[str] | str, ABCRole]
84 |         ) = {}
85 | 
86 |     def __repr__(self):
87 |         return self.data.__repr__()
88 | 
89 |     def _repr_html_(self):
90 |         return self.data._repr_html_()
91 | 
92 |     def __len__(self):
93 |         return self._backend.__len__()
94 | 
95 |     def search_columns(
96 |         self,
97 |         roles: ABCRole | Iterable[ABCRole],
98 |         tmp_role=False,
99 |         search_types: list | None = None,
100 |     ) -> list[str]:
101 |         roles = roles if isinstance(roles, Iterable) else [roles]
102 |         roles_for_search = self._tmp_roles if tmp_role else self.roles
103 |         return [
104 |             str(column)
105 |             for column, role in roles_for_search.items()
106 |             if any(
107 |                 isinstance(r, role.__class__)
108 |                 and (not search_types or role.data_type in search_types)
109 |                 for r in roles
110 |             )
111 |         ]
112 | 
113 |     def replace_roles(
114 |         self,
115 |         new_roles_map: dict[ABCRole | str, ABCRole],
116 |         tmp_role: bool = False,
117 |         auto_roles_types: bool = False,
118 |     ):
119 |         new_roles_map = parse_roles(
120 |             {
121 |                 role: (
122 |                     self.search_columns(column, tmp_role)
123 |                     if isinstance(column, ABCRole)
124 |                     else column
125 |                 )
126 |                 for column, role in new_roles_map.items()
127 |             }
128 |         )
129 | 
130 |         new_roles = {
131 |             column: new_roles_map[column] if column in new_roles_map else role
132 |             for column, role in self.roles.items()
133 |         }
134 | 
135 |         if tmp_role:
136 |             self._tmp_roles = new_roles
137 |         else:
138 |             self.roles = new_roles
139 |         if auto_roles_types:
140 |             self._set_empty_types(new_roles_map)
141 | 
142 |         return self
143 | 
144 |     @property
145 |     def index(self):
146 |         return self._backend.index
147 | 
148 |     @property
149 |     def data(self):
150 |         return self._backend.data
151 | 
152 |     @property
153 |     def roles(self):
154 |         return self._roles
155 | 
156 |     @roles.setter
157 |     def roles(self, value):
158 |         self._set_roles(new_roles_map=value, temp_role=False)
159 | 
160 |     @data.setter
161 |     def data(self, value):
162 |         self._backend.data = value
163 | 
164 |     @property
165 |     def columns(self):
166 |         return self._backend.columns
167 | 
168 |     @property
169 |     def shape(self):
170 |         return self._backend.shape
171 | 
172 |     @property
173 |     def tmp_roles(self):
174 |         return self._tmp_roles
175 | 
176 |     @tmp_roles.setter
177 |     def tmp_roles(self, value):
178 |         self._set_roles(new_roles_map=value, temp_role=True)
179 |         self._set_empty_types(self._tmp_roles)
180 | 
181 |     def to_dict(self):
182 |         return {
183 |             "backend": self._backend.name,
184 |             "roles": {
185 |                 "role_names": list(self.roles.keys()),
186 |                 "columns": list(self.roles.values()),
187 |             },
188 |             "data": self._backend.to_dict(),
189 |         }
190 | 
191 |     def to_records(self):
192 |         return self._backend.to_records()
193 | 
194 |     def to_json(self, filename: str | None = None):
195 |         if not filename:
196 |             return json.dumps(self.to_dict())
197 |         with open(filename, "w") as file:
198 |             json.dump(self.to_dict(), file)
199 | 
200 |     @property
201 |     def backend(self):
202 |         return self._backend
203 | 
204 |     def get_values(
205 |         self,
206 |         row: str | None = None,
207 |         column: str | None = None,
208 |     ) -> Any:
209 |         return self._backend.get_values(row=row, column=column)
210 | 
211 |     def iget_values(
212 |         self,
213 |         row: int | None = None,
214 |         column: int | None = None,
215 |     ) -> Any:
216 |         return self._backend.iget_values(row=row, column=column)
217 | 
218 |     def _set_roles(
219 |         self,
220 |         new_roles_map: dict[ABCRole, list[str] | str] | dict[list[str] | str, ABCRole],
221 |         temp_role: bool = False,
222 |     ):
223 |         if not new_roles_map:
224 |             if not temp_role:
225 |                 return self.roles
226 |             else:
227 |                 self._tmp_roles = {}
228 |             return self
229 | 
230 |         keys, values = list(new_roles_map.keys()), list(new_roles_map.values())
231 |         roles, columns_sets = (
232 |             (keys, values) if isinstance(keys[0], ABCRole) else (values, keys)
233 |         )
234 | 
235 |         new_roles = {}
236 |         for role, columns in zip(roles, columns_sets):
237 |             if isinstance(columns, list):
238 |                 for column in columns:
239 |                     new_roles[column] = copy.deepcopy(role)
240 |             else:
241 |                 new_roles[columns] = copy.deepcopy(role)
242 | 
243 |         if temp_role:
244 |             self._tmp_roles = new_roles
245 |         else:
246 |             self._roles = new_roles
247 | 
248 |         return self
249 | 
--------------------------------------------------------------------------------
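Editor's note: what follows is a hedged usage sketch of the role-mapping API from /hypex/dataset/abstract.py above; it is not part of the repository. It assumes only names shown in this dump — the Dataset and role classes re-exported by hypex.dataset, imported the same way as in tests/test_tutorials.py — and the DataFrame contents and column names are purely illustrative.

    import pandas as pd

    from hypex.dataset import Dataset, InfoRole, TargetRole, TreatmentRole

    # Illustrative frame; DatasetBase accepts a pandas DataFrame or a CSV path.
    df = pd.DataFrame(
        {"user_id": [1, 2, 3], "treat": [0, 1, 0], "post_spends": [10.0, 12.5, 9.8]}
    )

    # Roles can be given as a column -> role mapping (role -> columns also works,
    # per the __init__ annotation above).
    ds = Dataset(
        roles={
            "user_id": InfoRole(int),
            "treat": TreatmentRole(int),
            "post_spends": TargetRole(float),
        },
        data=df,
    )

    # search_columns resolves role instances back to column names ...
    print(ds.search_columns(TargetRole()))  # expected: ['post_spends']

    # ... and replace_roles rewrites the column -> role mapping in place.
    ds.replace_roles({"post_spends": InfoRole(float)})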