├── .python-version ├── psyke ├── genetic │ ├── __init__.py │ ├── fgin │ │ └── __init__.py │ └── gin │ │ └── __init__.py ├── extraction │ ├── hypercubic │ │ ├── gridrex │ │ │ └── __init__.py │ │ ├── utils.py │ │ ├── cosmik │ │ │ └── __init__.py │ │ ├── strategy.py │ │ ├── creepy │ │ │ └── __init__.py │ │ ├── gridex │ │ │ └── __init__.py │ │ ├── divine │ │ │ └── __init__.py │ │ ├── ginger │ │ │ └── __init__.py │ │ ├── hex │ │ │ └── __init__.py │ │ └── iter │ │ │ └── __init__.py │ ├── __init__.py │ ├── real │ │ ├── utils.py │ │ └── __init__.py │ ├── cart │ │ ├── FairTreePredictor.py │ │ ├── __init__.py │ │ ├── CartPredictor.py │ │ └── FairTree.py │ └── trepan │ │ ├── utils.py │ │ └── __init__.py ├── utils │ ├── sorted.py │ ├── __init__.py │ ├── metrics.py │ ├── dataframe.py │ └── plot.py ├── clustering │ ├── __init__.py │ ├── utils.py │ ├── cream │ │ └── __init__.py │ └── exact │ │ └── __init__.py ├── tuning │ ├── crash │ │ └── __init__.py │ ├── orchid │ │ └── __init__.py │ ├── __init__.py │ └── pedro │ │ └── __init__.py └── hypercubepredictor.py ├── test ├── unit │ ├── utils │ │ ├── __init__.py │ │ ├── test_simplify.py │ │ ├── test_simplify_formatter.py │ │ └── test_prune.py │ ├── clustering │ │ └── __init__.py │ ├── extraction │ │ ├── __init__.py │ │ ├── cart │ │ │ ├── __init__.py │ │ │ ├── test_cart.py │ │ │ └── test_simplified_cart.py │ │ ├── real │ │ │ ├── __init__.py │ │ │ ├── test_real.py │ │ │ └── test_rule.py │ │ ├── hypercubic │ │ │ ├── __init__.py │ │ │ ├── iter │ │ │ │ ├── __init__.py │ │ │ │ └── test_iter.py │ │ │ └── gridex │ │ │ │ ├── __init__.py │ │ │ │ └── test_gridex.py │ │ └── trepan │ │ │ ├── __init__.py │ │ │ ├── test_trepan.py │ │ │ ├── test_split.py │ │ │ └── test_node.py │ └── __init__.py └── __init__.py ├── .img ├── logo.png └── logo-wide.png ├── MANIFEST.in ├── pyproject.toml ├── .gitmodules ├── CONTRIBUTORS ├── .github ├── workflows │ ├── todos.yml │ ├── dockerify.yml │ ├── deploy.yml │ └── check.yml └── scripts │ └── retry.sh ├── requirements.txt ├── Dockerfile ├── renovate.json ├── .gitignore ├── CITATION.md ├── README.md ├── setup.py └── LICENSE /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.14 2 | -------------------------------------------------------------------------------- /psyke/genetic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
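The tree above shows that every extraction algorithm (GridEx, GridREx, ITER, CART, REAL, Trepan, ...) lives in its own sub-package, but the test files later in this dump obtain extractors through the top-level Extractor factory rather than by importing those sub-packages directly. A minimal sketch, assuming a scikit-learn-style predictor and the Extractor.gridrex factory that appears in the tests below (parameter values beyond those shown there are assumptions):

    from sklearn.tree import DecisionTreeRegressor
    from psyke import Extractor
    from psyke.extraction.hypercubic import Grid

    # Any scikit-learn-style model can be wrapped; it should be fitted on a pandas DataFrame first.
    predictor = DecisionTreeRegressor()
    # predictor.fit(train.iloc[:, :-1], train.iloc[:, -1])

    # Wrap the fitted model in an extractor and obtain a tuProlog theory from the training data.
    extractor = Extractor.gridrex(predictor, Grid())
    # theory = extractor.extract(train)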
/test/unit/extraction/hypercubic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/HEAD/.img/logo.png -------------------------------------------------------------------------------- /.img/logo-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/HEAD/.img/logo-wide.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include VERSION 2 | exclude test/* 3 | exclude demo/* 4 | exclude main.py 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test/resources"] 2 | path = test/resources 3 | url = https://github.com/psykei/psyke-pytest.git 4 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Federico Sabbatini (f.sabbatini1@campus.uniurb.it, federico.sabbatini992@gmail.com) 2 | Giovanni Ciatto (giovanni.ciatto@unibo.it, giovanni.ciatto@gmail.com) 3 | Matteo Magnini (matteo.magnini@unibo.it, matteo.magnini00@gmail.com) 4 | -------------------------------------------------------------------------------- /.github/workflows/todos.yml: -------------------------------------------------------------------------------- 1 | name: "TODOs finder" 2 | on: 3 | push: 4 | branches-ignore: 5 | - 'autodelivery**' 6 | - 'bump-**' 7 | - 'renovate/**' 8 | - 'dependabot/**' 9 | jobs: 10 | build: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - uses: "actions/checkout@master" 14 | - name: "TODO to Issue" 15 | uses: "alstr/todo-to-issue-action@v5.1.13" 16 | id: "todo" 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.3.5 2 | pandas==2.3.3 3 | scikit-learn~=1.8.0 4 | matplotlib==3.10.8 5 | sympy==1.14.0 6 | parameterized==0.9.0 7 | kneed==0.8.5 8 | deap==1.4.3 9 | scikit-fuzzy==0.5.0 10 | 2ppy==0.4.1 11 | build==1.3.0 12 | twine==6.2.0 13 | setuptools==80.9.0 14 | onnx==1.20.0 15 | 
onnxruntime==1.23.2 16 | onnxconverter-common==1.16.0 17 | skl2onnx==1.19.1 18 | joblib>=1.5.1 19 | keras>=3.10.0 20 | tensorflow==2.20.0 21 | protobuf>=3.20.3 22 | ml-dtypes>=0.3.1 23 | -------------------------------------------------------------------------------- /.github/scripts/retry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DT=${2:-${RETRY_TIME:-5m}} 4 | MAX=${3:-${MAX_RETRIES:-3}} 5 | 6 | for N in `seq 1 $MAX`; do 7 | echo "Attempt $N/$MAX: $1" 8 | eval $1; 9 | RESULT=$? 10 | if [[ $RESULT -eq 0 ]]; then 11 | exit 0 12 | fi 13 | if [[ $N -lt $MAX ]]; then 14 | echo "Failed attempt $N/$MAX. Waiting $DT" 15 | sleep $DT 16 | else 17 | echo "Failed attempt $N/$MAX." 18 | exit $RESULT 19 | fi 20 | done 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | ARG PSYKE_VERSION 3 | EXPOSE 8888 4 | RUN apt update; apt install -y -q openjdk-17-jdk 5 | RUN pip install jupyter 6 | RUN pip install psyke==$PSYKE_VERSION 7 | RUN mkdir -p /root/.jupyter 8 | ENV JUPYTER_CONF_FILE /root/.jupyter/jupyter_notebook_config.py 9 | RUN echo "c.NotebookApp.allow_origin = '*'" > $JUPYTER_CONF_FILE 10 | RUN echo "c.NotebookApp.ip = '0.0.0.0'" >> $JUPYTER_CONF_FILE 11 | RUN mkdir -p /notebook 12 | COPY test/resources/datasets/*.csv /notebook/datasets/ 13 | WORKDIR /notebook 14 | CMD jupyter notebook --allow-root --no-browser 15 | -------------------------------------------------------------------------------- /test/unit/extraction/real/test_real.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from test.unit import initialize 5 | 6 | 7 | @parameterized_class(initialize('real')) 8 | class TestReal(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 14 | 15 | def test_predict(self): 16 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridrex/__init__.py: -------------------------------------------------------------------------------- 1 | from psyke import get_default_random_seed, Target 2 | from psyke.extraction.hypercubic import Grid, RegressionCube 3 | from psyke.extraction.hypercubic.gridex import GridEx 4 | 5 | 6 | class GridREx(GridEx): 7 | """ 8 | Explanator implementing GridREx algorithm, doi:10.24963/kr.2022/57. 
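    A minimal usage sketch (assumption: `predictor` is any fitted scikit-learn-style
    regressor; the remaining arguments follow the __init__ below and the concrete
    values here are illustrative only):

        extractor = GridREx(predictor, Grid(), min_examples=100, threshold=0.1, normalization=None)
        theory = extractor.extract(dataframe)

    The tests in this repository build the same extractor through Extractor.gridrex(predictor, Grid()).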
9 | """ 10 | 11 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, normalization, 12 | seed=get_default_random_seed()): 13 | super().__init__(predictor, grid, min_examples, threshold, Target.REGRESSION, None, normalization, seed) 14 | 15 | def _default_cube(self, dimensions=None) -> RegressionCube: 16 | return RegressionCube() 17 | -------------------------------------------------------------------------------- /psyke/utils/sorted.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Any 2 | 3 | 4 | class SortedList(list): 5 | 6 | def __init__(self, comparator: Callable[[Any, Any], int]): 7 | super().__init__() 8 | self.comparator = comparator 9 | 10 | def add(self, item) -> None: 11 | if len(self) == 0: 12 | self.insert(0, item) 13 | else: 14 | starting_len = len(self) 15 | for index, element in enumerate(self): 16 | if self.comparator(element, item) > 0: 17 | self.insert(index, item) 18 | break 19 | if len(self) == starting_len: 20 | self.append(item) 21 | 22 | def add_all(self, other) -> None: 23 | for item in other: 24 | self.add(item) 25 | -------------------------------------------------------------------------------- /psyke/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable 3 | 4 | from psyke import Clustering, Target 5 | from psyke.extraction.hypercubic import HyperCube 6 | from psyke.hypercubepredictor import HyperCubePredictor 7 | 8 | 9 | class HyperCubeClustering(HyperCubePredictor, Clustering, ABC): 10 | 11 | def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None): 12 | HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization) 13 | self._protected_features = [] 14 | 15 | def get_hypercubes(self) -> Iterable[HyperCube]: 16 | raise NotImplementedError('get_hypercubes') 17 | 18 | def make_fair(self, features: Iterable[str]): 19 | self._protected_features = features 20 | -------------------------------------------------------------------------------- /test/unit/utils/test_simplify.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import simplify 5 | 6 | 7 | class TestSimplify(unittest.TestCase): 8 | 9 | def test_simplify(self): 10 | # TODO: if numbers are not float equals method return false (e.g., 2 instead of 2.0). @Giovanni 2ppy 11 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1.0), '>'(Y, 2.0), '=<'(X, 0.5))." 12 | textual_simplified_theory = "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 2.0))." 
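        # The fixture encodes what simplify() is expected to do here: '=<'(X, 1.0) and '=<'(X, 0.5)
        # bound the same variable in the same direction, so only the tighter '=<'(X, 0.5) is kept,
        # while the independent '>'(Y, 2.0) constraint is left untouched.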
13 | long_theory = mutable_theory(parse_theory(textual_theory)) 14 | simplified_theory = theory(parse_theory(textual_simplified_theory)) 15 | 16 | self.assertTrue(simplified_theory.equals(simplify(long_theory), False)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /test/unit/utils/test_simplify_formatter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.tree import DecisionTreeRegressor 4 | from psyke import Extractor, get_default_random_seed 5 | from psyke.extraction.hypercubic import Grid 6 | from test import get_dataset 7 | 8 | 9 | class TestSimplifyFormatter(unittest.TestCase): 10 | 11 | def test_simplify_formatter(self): 12 | data = get_dataset('house') 13 | train, test = train_test_split(data, test_size=0.5, random_state=get_default_random_seed()) 14 | predictor = DecisionTreeRegressor() 15 | predictor.fit(train.iloc[:, :-1], train.iloc[:, -1]) 16 | extractor = Extractor.gridrex(predictor, Grid()) 17 | theory = extractor.extract(train) 18 | # print(pretty_theory(theory)) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | ":rebaseStalePrs", 5 | ":semanticCommits", 6 | "docker:disable" 7 | ], 8 | "assignees": [ 9 | "MatteoMagnini" 10 | ], 11 | "automerge": true, 12 | "dependencyDashboard": true, 13 | "git-submodules": { 14 | "enabled": true 15 | }, 16 | "includeForks": true, 17 | "packageRules": [ 18 | { 19 | "description": "Updates to GitHub Actions should be tagged as 'ci'", 20 | "matchPaths": [ 21 | ".github/workflows/*.yml", 22 | ".github/workflows/*.yaml" 23 | ], 24 | "semanticCommitType": "ci" 25 | }, 26 | { 27 | "matchPackageNames": ["net.sourceforge.plantuml:plantuml"], 28 | "allowedVersions": "/^1\\./" 29 | } 30 | ], 31 | "prConcurrentLimit": 25, 32 | "prHourlyLimit": 0, 33 | "separateMajorMinor": true, 34 | "separateMinorPatch": true, 35 | "separateMultipleMajor": true 36 | } 37 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/iter/test_iter.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from psyke.utils import get_default_precision 4 | from test.unit import initialize 5 | import unittest 6 | 7 | 8 | @parameterized_class(initialize('iter')) 9 | class TestIter(unittest.TestCase): 10 | 11 | def test_extract(self): 12 | logger.info(self.expected_theory) 13 | logger.info(self.extracted_theory) 14 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 21 | get_default_precision()) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /psyke/extraction/__init__.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import pandas as pd 4 | from tuprolog.theory import Theory 5 | 6 | from psyke import Extractor 7 | 8 | 9 | class PedagogicalExtractor(Extractor, ABC): 10 | 11 | def __init__(self, predictor, discretization=None, normalization=None): 12 | Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization) 13 | 14 | def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame: 15 | new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index) 16 | data = dataframe.iloc[:, :-1].copy().join(new_y) 17 | data.columns = dataframe.columns 18 | return data 19 | 20 | def extract(self, dataframe: pd.DataFrame) -> Theory: 21 | self.theory = self._extract(self._substitute_output(dataframe)) 22 | return self.theory 23 | 24 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 25 | raise NotImplementedError('extract') 26 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/gridex/test_gridex.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from test.unit import initialize 4 | import unittest 5 | 6 | 7 | @parameterized_class(initialize('gridex')) 8 | class TestGridEx(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | # This test does not pass the ci, however it is not clear to me why (local ok). Could it be non-deterministic? 14 | # self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | # TODO: check this! 
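            # Unlike the ITER/Trepan/CART tests, which bound the theory-vs-extractor difference by
            # get_default_precision() (1e-6 by default in psyke.utils), this check uses a much
            # looser 0.05 tolerance.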
21 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 0.05) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_trepan.py: -------------------------------------------------------------------------------- 1 | from cmath import isclose 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from psyke.utils import get_default_precision 5 | from psyke.utils.logic import pretty_theory 6 | from test.unit import initialize 7 | import unittest 8 | 9 | 10 | @parameterized_class(initialize('trepan')) 11 | class TestTrepan(unittest.TestCase): 12 | 13 | def test_extract(self): 14 | logger.info(pretty_theory(self.expected_theory) + '\n') 15 | logger.info(pretty_theory(self.extracted_theory) + '\n') 16 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 17 | 18 | def test_predict(self): 19 | if isinstance(self.extracted_test_y_from_theory[0], str): 20 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 21 | else: 22 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 23 | get_default_precision()) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_split.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node, Split 2 | from test import get_dataset 3 | import math 4 | import pandas as pd 5 | import unittest 6 | 7 | 8 | class TestSplit(unittest.TestCase): 9 | 10 | dataset: pd.DataFrame = get_dataset('iris') 11 | n_examples = dataset.shape[0] 12 | all_node = Node(dataset, n_examples) 13 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 14 | setosa_40_complementar = Node(pd.concat([dataset.iloc[:10, :], dataset.iloc[70:, :]]), n_examples) 15 | versicolor_25 = Node(dataset.iloc[40:75, :], n_examples) 16 | versicolor_25_complementar = Node(dataset.iloc[75:110, :], n_examples) 17 | 18 | def test_priority(self): 19 | self.assertTrue(math.isclose(-40/60-50/90-100, 20 | Split(self.all_node, (self.setosa_40, self.setosa_40_complementar)).priority)) 21 | self.assertTrue(math.isclose((25 / 35) * - 2 - 200 + 200, 22 | Split(self.all_node, (self.versicolor_25, self.versicolor_25_complementar)) 23 | .priority)) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /.github/workflows/dockerify.yml: -------------------------------------------------------------------------------- 1 | name: dockerify 2 | on: 3 | workflow_run: 4 | workflows: 5 | - deploy 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: dockerify 14 | RETRY_TIME: 5m 15 | MAX_RETRIES: 3 16 | jobs: 17 | dockerify: 18 | runs-on: ubuntu-latest 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | name: Dockerify with Jupyter support 21 | steps: 22 | - name: Docker Login 23 | run: docker login -u ${{ secrets.DOCKERHUB_USERANAME }} -p ${{ secrets.DOCKERHUB_PASSWORD }} 24 | 25 | - name: Checkout code 26 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 27 | with: 28 | fetch-depth: 0 # all history 29 | submodules: recursive 30 
| 31 | - name: Get All Tags 32 | run: git fetch --tags -f 33 | 34 | - name: Get Version 35 | id: get-version 36 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 37 | 38 | - name: Create Docker Image 39 | run: ./.github/scripts/retry.sh "docker build -t pikalab/psyke:$PSYKE_VERSION --build-arg PSYKE_VERSION=$PSYKE_VERSION ." 40 | shell: bash 41 | env: 42 | PSYKE_VERSION: '${{ steps.get-version.outputs.version }}' 43 | 44 | - name: Push Image on Docker Hub 45 | run: docker push pikalab/psyke:${{ steps.get-version.outputs.version }} 46 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/test_cart.py: -------------------------------------------------------------------------------- 1 | from parameterized import parameterized_class 2 | from psyke.utils import get_default_precision 3 | from psyke import logger 4 | from test.unit import initialize 5 | import unittest 6 | 7 | """ 8 | TODO (?): right now there is a small chance that corner data are wrongly predicted (that is fine for now). 9 | In other words, if we use the extracted rules (with a specific default accuracy fo float) 10 | and compare their result with the one obtained by the actual decision tree (thresholds do not have truncated float) 11 | they may be different. To avoid this, when we will refactor all extractor we will also address this issue. 12 | """ 13 | 14 | 15 | @parameterized_class(initialize('cart')) 16 | class TestCart(unittest.TestCase): 17 | 18 | def test_extract(self): 19 | logger.info(self.expected_theory) 20 | logger.info(self.extracted_theory) 21 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 22 | 23 | def test_predict(self): 24 | # self.assertEqual(self.extracted_test_y_from_theory, self.extracted_test_y_from_pruned_theory) 25 | if isinstance(self.extracted_test_y_from_theory[0], str): 26 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 27 | else: 28 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 29 | get_default_precision()) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /psyke/clustering/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from kneed import KneeLocator 4 | from sklearn.cluster import DBSCAN 5 | from sklearn.mixture import GaussianMixture 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float, int, GaussianMixture]: 10 | components = range(2, max_components + 1) 11 | try: 12 | models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)] 13 | except ValueError: 14 | print(len(data)) 15 | return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)]) 16 | 17 | 18 | def select_dbscan_epsilon(data: pd.DataFrame, clusters: int) -> float: 19 | neighbors = NearestNeighbors(n_neighbors=min(len(data.columns) * 2, len(data))).fit(data) 20 | distances = sorted(np.mean(neighbors.kneighbors(data)[1], axis=1), reverse=True) 21 | try: 22 | kn = KneeLocator([d for d in range(len(distances))], distances, 23 | curve='convex', direction='decreasing', online=True) 24 | if kn.knee is None or kn.knee_y is None: 25 | epsilon = max(distances[-1], 1e-3) 26 | else: 27 | epsilon 
= kn.knee_y 28 | except (RuntimeWarning, UserWarning, ValueError): 29 | epsilon = max(distances[-1], 1e-3) 30 | k = 1. 31 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 32 | # while Counter(dbscan_pred).most_common(1)[0][0] == -1: 33 | for i in range(1000): 34 | if len(np.unique(dbscan_pred)) < clusters + 1: 35 | break 36 | k += .1 37 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 38 | return epsilon * k 39 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import math 3 | import warnings 4 | 5 | warnings.simplefilter("ignore") 6 | 7 | Dimension = tuple[float, float] 8 | Dimensions = dict[str, Dimension] 9 | 10 | 11 | class Expansion: 12 | 13 | def __init__(self, cube, feature: str, direction: str, distance: float = math.nan): 14 | self.cube = cube 15 | self.feature = feature 16 | self.direction = direction 17 | self.distance = distance 18 | 19 | def __getitem__(self, index: int) -> float: 20 | return self.cube[self.feature][index] 21 | 22 | def boundaries(self, a: float, b: float) -> (float, float): 23 | return (self[0], b) if self.direction == '-' else (a, self[1]) 24 | 25 | 26 | class Limit: 27 | 28 | def __init__(self, feature: str, direction: str): 29 | self.feature = feature 30 | self.direction = direction 31 | 32 | def __eq__(self, other): 33 | return (self.feature == other.feature) and (self.direction == other.direction) 34 | 35 | def __hash__(self): 36 | return hash(self.feature + self.direction) 37 | 38 | 39 | class MinUpdate: 40 | 41 | def __init__(self, name: str, value: float): 42 | self.name = name 43 | self.value = value 44 | 45 | 46 | class ZippedDimension: 47 | 48 | def __init__(self, name: str, this_dimension: Dimension, other_dimension: Dimension): 49 | self.name = name 50 | self.this_dimension = this_dimension 51 | self.other_dimension = other_dimension 52 | 53 | def __eq__(self, other: ZippedDimension) -> bool: 54 | return (self.name == other.name) and (self.this_dimension == other.this_dimension) and \ 55 | (self.other_dimension == other.other_dimension) 56 | 57 | -------------------------------------------------------------------------------- /test/unit/extraction/real/test_rule.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from psyke.extraction.real.utils import Rule 3 | from psyke.utils.dataframe import split_features 4 | from test import get_dataset 5 | 6 | 7 | class TestRule(unittest.TestCase): 8 | 9 | def test_subrule(self): 10 | pred_1, pred_2 = ['V1', 'V2'], ['V3', 'V4'] 11 | rule_1 = Rule(pred_1, pred_2) 12 | self.assertTrue(rule_1 in rule_1) 13 | rule_2 = Rule(pred_2, pred_1) 14 | self.assertFalse(rule_1 in rule_2) 15 | self.assertFalse(rule_2 in rule_1) 16 | rule_3 = Rule(['V1'], ['V3']) 17 | self.assertTrue(rule_1 in rule_3) 18 | self.assertFalse(rule_3 in rule_1) 19 | self.assertFalse(rule_2 in rule_3) 20 | self.assertFalse(rule_3 in rule_2) 21 | rule_4 = Rule(["V1"], ["V5"]) 22 | self.assertFalse(rule_1 in rule_4) 23 | self.assertFalse(rule_4 in rule_1) 24 | rule_5 = Rule(["V1", "V6"], ["V3", "V4"]) 25 | self.assertFalse(rule_1 in rule_5) 26 | self.assertFalse(rule_5 in rule_1) 27 | self.assertTrue(rule_1 in Rule([], [])) 28 | 29 | def test_reduce(self): 30 | dataset = get_dataset('iris') 31 | features = split_features(dataset) 32 | rule = Rule(["V1_1", 
"V2_2", "V3_0"], 33 | ["V1_0", "V2_1", "V2_0", "V4_1", "V4_2"]) 34 | reduced_rule = Rule(["V1_1", "V2_2", "V3_0"], 35 | ["V4_1", "V4_2"]) 36 | self.assertEqual(reduced_rule.true_predicates, rule.reduce(features).true_predicates) 37 | self.assertEqual(reduced_rule.false_predicates, rule.reduce(features).false_predicates) 38 | self.assertEqual(reduced_rule.true_predicates, reduced_rule.reduce(features).true_predicates) 39 | self.assertEqual(reduced_rule.false_predicates, reduced_rule.reduce(features).false_predicates) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/cosmik/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.mixture import GaussianMixture 4 | from tuprolog.theory import Theory 5 | 6 | from psyke import Target, Extractor, get_default_random_seed 7 | from psyke.clustering.utils import select_gaussian_mixture 8 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor, RegressionCube 9 | 10 | 11 | class COSMiK(HyperCubeExtractor): 12 | """ 13 | Explanator implementing COSMiK algorithm. 14 | """ 15 | 16 | def __init__(self, predictor, max_components: int = 4, k: int = 5, patience: int = 15, close_to_center: bool = True, 17 | output: Target = Target.CONSTANT, discretization=None, normalization=None, 18 | seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.REGRESSION, discretization, normalization) 20 | self.max = max_components 21 | self.k = k 22 | self.patience = patience 23 | self.output = output 24 | self.close_to_center = close_to_center 25 | self.seed = seed 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | np.random.seed(self.seed) 29 | X, y = dataframe.iloc[:, :-1], dataframe.iloc[:, -1] 30 | 31 | _, n, _ = select_gaussian_mixture(dataframe, self.max) 32 | gmm = GaussianMixture(n) 33 | gmm.fit(X, y) 34 | 35 | divine = Extractor.divine(gmm, self.k, self.patience, self.close_to_center, 36 | self.discretization, self.normalization) 37 | df = X.join(pd.DataFrame(gmm.predict(X))) 38 | df.columns = dataframe.columns 39 | divine.extract(df) 40 | 41 | self._hypercubes = [HyperCube(cube.dimensions.copy()) if self.output == Target.CONSTANT else 42 | RegressionCube(cube.dimensions.copy()) for cube in divine._hypercubes] 43 | for cube in self._hypercubes: 44 | cube.update(dataframe, self.predictor) 45 | 46 | self._sort_cubes() 47 | return self._create_theory(dataframe) -------------------------------------------------------------------------------- /psyke/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from math import log10 3 | from random import Random 4 | 5 | _DEFAULT_RANDOM_SEED: int = 123 6 | 7 | ONNX_EXTENSION: str = '.onnx' 8 | 9 | _random_options = dict(_deterministic_mode=True, _default_random_seed=_DEFAULT_RANDOM_SEED) 10 | 11 | _random_seed_generator: Random = Random(_DEFAULT_RANDOM_SEED) 12 | 13 | _DEFAULT_PRECISION: float = 1e-6 14 | 15 | _precision_options: dict = {'precision': _DEFAULT_PRECISION} 16 | 17 | 18 | class TypeNotAllowedException(Exception): 19 | 20 | def __init__(self, type_name: str): 21 | super().__init__('Type "' + type_name + '" not allowed for discretization.') 22 | 23 | 24 | class Range: 25 | def __init__(self, mean: float, std: float): 26 | self.mean = mean 27 | self.std = std 28 | 
self.lower = mean 29 | self.upper = mean 30 | 31 | def left_infinite(self): 32 | self.lower = float('-inf') 33 | 34 | def right_infinite(self): 35 | self.upper = float('inf') 36 | 37 | def expand_left(self): 38 | self.lower -= self.std 39 | 40 | def expand_right(self): 41 | self.upper += self.std 42 | 43 | 44 | def is_deterministic_mode(): 45 | return _random_options['_deterministic_mode'] 46 | 47 | 48 | def set_deterministic_mode(value: bool): 49 | _random_options['_deterministic_mode'] = value 50 | 51 | 52 | def get_default_random_seed(): 53 | if is_deterministic_mode(): 54 | return _random_options['_default_random_seed'] 55 | else: 56 | return _random_seed_generator.randint(0, 1 << 64) 57 | 58 | 59 | def set_default_random_seed(value: int): 60 | _random_options['_default_random_seed'] = value 61 | 62 | 63 | def get_default_precision() -> float: 64 | return _precision_options['precision'] 65 | 66 | 67 | def get_int_precision() -> int: 68 | return -1 * int(log10(get_default_precision())) 69 | 70 | 71 | def set_default_precision(value: float): 72 | _precision_options['precision'] = value 73 | 74 | 75 | class Target(Enum): 76 | CLASSIFICATION = 1, 77 | CONSTANT = 2, 78 | REGRESSION = 3 79 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/strategy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import reduce 4 | from collections.abc import Iterable 5 | 6 | 7 | class Strategy: 8 | def __init__(self, partitions = None): 9 | self._partitions = partitions 10 | self._no_features = [] 11 | 12 | def get(self, feature: str) -> int: 13 | raise NotImplementedError 14 | 15 | def make_fair(self, features: Iterable[str]): 16 | self._no_features = features 17 | 18 | def partition_number(self, features: Iterable[str]) -> int: 19 | return reduce(lambda x, y: x * y, map(self.get, features), 1) 20 | 21 | def equals(self, strategy, features: Iterable[str]) -> bool: 22 | eq = True 23 | for f in features: 24 | eq = eq and self.get(f) == strategy.get(f) 25 | return eq 26 | 27 | def __str__(self): 28 | return self._partitions 29 | 30 | def __repr__(self): 31 | return self.__str__() 32 | 33 | 34 | class FixedStrategy(Strategy): 35 | def __init__(self, partitions: int = 2): 36 | super().__init__(partitions) 37 | 38 | def get(self, feature: str) -> int: 39 | return 1 if feature in self._no_features else self._partitions 40 | 41 | def __str__(self): 42 | return "Fixed ({})".format(super().__str__()) 43 | 44 | 45 | class AdaptiveStrategy(Strategy): 46 | def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None): 47 | super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)]) 48 | self.features = features 49 | 50 | def get(self, feature: str) -> int: 51 | if feature in self._no_features: 52 | return 1 53 | importance = next(filter(lambda t: t[0] == feature, self.features))[1] 54 | n = 1 55 | for (imp, part) in self._partitions: 56 | if importance >= imp: 57 | n = part 58 | else: 59 | break 60 | return n 61 | 62 | def __str__(self): 63 | return "Adaptive ({})".format(super().__str__()) 64 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/creepy/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterable 4 | from 
typing import Callable, Any 5 | 6 | import pandas as pd 7 | from sklearn.base import ClassifierMixin 8 | from tuprolog.theory import Theory 9 | from psyke import Clustering 10 | from psyke.clustering import HyperCubeClustering 11 | from psyke.extraction.hypercubic import HyperCubeExtractor 12 | from psyke.utils import Target, get_default_random_seed 13 | 14 | 15 | class CReEPy(HyperCubeExtractor): 16 | """ 17 | Explanator implementing CReEPy algorithm. 18 | """ 19 | 20 | ClusteringType = Callable[[int, float, Target, int, Any, Any, int], HyperCubeClustering] 21 | 22 | def __init__(self, predictor, clustering: ClusteringType = Clustering.exact, depth: int = 3, 23 | error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 5, 24 | ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0, discretization=None, 25 | normalization=None, seed: int = get_default_random_seed()): 26 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 27 | discretization, normalization) 28 | self.clustering = clustering(depth, error_threshold, self._output, gauss_components, discretization, 29 | normalization, seed) 30 | self._default_surrounding_cube = True 31 | self._dimensions_to_ignore = set([dimension for dimension, relevance in ranks if relevance < ignore_threshold]) 32 | self._protected_features = [] 33 | 34 | def make_fair(self, features: Iterable[str]): 35 | self.clustering.make_fair(features) 36 | self._dimensions_to_ignore.update(features) 37 | 38 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 39 | if not isinstance(self.clustering, HyperCubeClustering): 40 | raise TypeError("clustering must be a HyperCubeClustering") 41 | 42 | self.clustering.fit(dataframe) 43 | self._hypercubes = self.clustering.get_hypercubes() 44 | self._surrounding = self._hypercubes[-1] 45 | return self._create_theory(dataframe) 46 | -------------------------------------------------------------------------------- /psyke/extraction/real/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from psyke import DiscreteFeature 3 | from typing import Iterable 4 | import pandas as pd 5 | 6 | 7 | class Rule: 8 | 9 | def __init__(self, true_predicates: list[str], false_predicates: list[str]): 10 | self.true_predicates = true_predicates 11 | self.false_predicates = false_predicates 12 | 13 | def __contains__(self, other: Rule) -> bool: 14 | return all([predicate in other.true_predicates for predicate in self.true_predicates]) and\ 15 | all([predicate in other.false_predicates for predicate in self.false_predicates]) 16 | 17 | def __eq__(self, other: Rule) -> bool: 18 | return self.true_predicates == other.true_predicates and self.false_predicates == other.false_predicates 19 | 20 | def __hash__(self) -> int: 21 | return hash(self.true_predicates) + hash(self.false_predicates) 22 | 23 | def reduce(self, features: Iterable[DiscreteFeature]) -> Rule: 24 | to_be_removed = [item for tp in self.true_predicates 25 | for feature in features if tp in feature.admissible_values 26 | for item in feature.admissible_values.keys()] 27 | return Rule(self.true_predicates, [fp for fp in self.false_predicates if fp not in to_be_removed]) 28 | 29 | def to_lists(self) -> list[list[str]]: 30 | return [self.true_predicates.copy(), self.false_predicates.copy()] 31 | 32 | 33 | class IndexedRuleSet(dict[int, list[Rule]]): 34 | 35 | def flatten(self) -> list[tuple[int, Rule]]: 36 | 
return [(key, value) for key, values in self.items() for value in values] 37 | 38 | def optimize(self) -> IndexedRuleSet: 39 | useless_rules = [item for key, entry in self.items() for item in IndexedRuleSet._useless_rules(key, entry)] 40 | for rule in useless_rules: 41 | self[rule[0]].remove(rule[1]) 42 | return self 43 | 44 | @staticmethod 45 | def _useless_rules(key, rules: list[Rule]) -> list[(int, Rule)]: 46 | return [ 47 | (key, rule) for rule in rules 48 | if any(rule in other_rule for other_rule in rules if other_rule != rule) 49 | ] 50 | 51 | @staticmethod 52 | def create_indexed_ruleset(indices: Iterable) -> IndexedRuleSet: 53 | return IndexedRuleSet({i: [] for i in indices}) 54 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | on: 3 | workflow_run: 4 | workflows: 5 | - check 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: depoly 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 18 | name: Deploy on PyPI and create release 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 22 | with: 23 | fetch-depth: 0 # all history 24 | submodules: recursive 25 | 26 | - name: Get All Tags 27 | run: git fetch --tags -f 28 | 29 | - name: Get Python Version 30 | id: get-python-version 31 | run: echo ::set-output name=version::$(cat .python-version) 32 | 33 | - name: Setup Python 34 | uses: actions/setup-python@v6 35 | with: 36 | python-version: ${{ steps.get-python-version.outputs.version }} 37 | 38 | - name: Restore Python dependencies 39 | run: | 40 | pip install -r requirements.txt 41 | 42 | - name: Change default logging level 43 | run: sed -i -e 's/DEBUG/WARN/g' psyke/__init__.py 44 | 45 | - name: Pack 46 | run: python -m build 47 | 48 | - name: Archive Dist Artifacts 49 | if: failure() || success() 50 | uses: actions/upload-artifact@v6 51 | with: 52 | name: dist 53 | path: './dist' 54 | 55 | - name: Upload 56 | run: python -m twine upload dist/* 57 | env: 58 | TWINE_USERNAME: ${{ secrets.PYPI_USERANAME }} 59 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 60 | 61 | - name: Get Version 62 | id: get-version 63 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 64 | 65 | - name: Release Assets 66 | id: upload-release-assets 67 | run: | 68 | set -x 69 | ASSETS=() 70 | for A in dist/*; do 71 | ASSETS+=("-a" "$A") 72 | echo "Releasing $A" 73 | done 74 | RELEASE_TAG='${{ steps.get-version.outputs.version }}' 75 | gh release create "$RELEASE_TAG" 76 | env: 77 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 78 | -------------------------------------------------------------------------------- /test/unit/utils/test_prune.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import prune 5 | 6 | 7 | class TestPrune(unittest.TestCase): 8 | 9 | def test_prune_documentation(self): 10 | theory1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 11 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 12 | pruned1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 
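        # The first clause of theory1 merely adds '=='(C, 0) to the constraints of the second clause,
        # so it covers a subset of the second clause's cases and prune() is expected to drop it.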
13 | 14 | theory2 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 15 | + "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 16 | pruned2 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 17 | 18 | theory3 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8)). " \ 19 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0))." 20 | pruned3 = pruned2 21 | 22 | self.assertTrue(theory(parse_theory(pruned1)).equals(prune(mutable_theory(parse_theory(theory1))), False)) 23 | self.assertTrue(theory(parse_theory(pruned2)).equals(prune(mutable_theory(parse_theory(theory2))), False)) 24 | self.assertTrue(theory(parse_theory(pruned3)).equals(prune(mutable_theory(parse_theory(theory3))), False)) 25 | 26 | def test_prune_success(self): 27 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2)). " \ 28 | + "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 3))." 29 | textual_pruned_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2))." 30 | long_theory = mutable_theory(parse_theory(textual_theory)) 31 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 32 | 33 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 34 | 35 | def test_prune_not_applied(self): 36 | textual_theory = "p(PL, PW, SL, SW, versicolor) :- '=<'(SW, 3.6). " \ 37 | + "p(PL, PW, SL, SW, versicolor) :- ('=<'(PW, 0.35), '=<'(SL, 5.35), '=<'(SW, 3.9))." 38 | textual_pruned_theory = textual_theory 39 | long_theory = mutable_theory(parse_theory(textual_theory)) 40 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 41 | 42 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /psyke/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score 4 | 5 | 6 | def mae(expected, predicted): 7 | """ 8 | Calculates the predictions' MAE w.r.t. the instances given as input. 9 | 10 | :param expected: the expected data . 11 | :param predicted: the predicted data. 12 | :return: the mean absolute error (MAE) of the predictions. 13 | """ 14 | return score(expected, predicted, mean_absolute_error) 15 | 16 | 17 | def mse(expected, predicted): 18 | """ 19 | Calculates the predictions' MSE w.r.t. the instances given as input. 20 | 21 | :param expected: the expected data . 22 | :param predicted: the predicted data. 23 | :return: the mean squared error (MSE) of the predictions. 24 | """ 25 | return score(expected, predicted, mean_squared_error) 26 | 27 | 28 | def r2(expected, predicted): 29 | """ 30 | Calculates the predictions' R2 w.r.t. the instances given as input. 31 | 32 | :param expected: the expected data . 33 | :param predicted: the predicted data. 34 | :return: the R2 score of the predictions. 35 | """ 36 | return score(expected, predicted, r2_score) 37 | 38 | 39 | def accuracy(expected, predicted): 40 | """ 41 | Calculates the predictions' classification accuracy w.r.t. the instances given as input. 42 | 43 | :param expected: the expected data . 44 | :param predicted: the predicted data. 45 | :return: the classification accuracy of the predictions. 46 | """ 47 | return score(expected, predicted, accuracy_score) 48 | 49 | 50 | def f1(expected, predicted): 51 | """ 52 | Calculates the predictions' F1 score w.r.t. 
the instances given as input. 53 | 54 | :param expected: the expected data . 55 | :param predicted: the predicted data. 56 | :return: the F1 score of the predictions. 57 | """ 58 | return score(expected, predicted, partial(f1_score, average='weighted')) 59 | 60 | 61 | def score(expected, predicted, scoring_function): 62 | """ 63 | Calculates the predictions' score w.r.t. the instances given as input with the provided scoring function. 64 | 65 | :param expected: the expected data . 66 | :param predicted: the predicted data. 67 | :param scoring_function: the scoring function to be used. 68 | :return: the score of the predictions. 69 | """ 70 | idx = [prediction is not None for prediction in predicted] 71 | return scoring_function(expected[idx], predicted[idx]) 72 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTreePredictor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Union, Any 3 | 4 | from psyke.extraction.cart import FairTreeClassifier, FairTreeRegressor, LeafSequence, LeafConstraints 5 | from psyke.extraction.cart.CartPredictor import CartPredictor 6 | from psyke.schema import LessThan, GreaterThan, SchemaException, Value 7 | 8 | 9 | class FairTreePredictor(CartPredictor): 10 | """ 11 | A wrapper for fair decision and regression trees of psyke. 12 | """ 13 | 14 | def __init__(self, predictor: Union[FairTreeClassifier, FairTreeRegressor] = FairTreeClassifier(), 15 | discretization=None, normalization=None): 16 | super().__init__(predictor, discretization, normalization) 17 | 18 | def __iter__(self) -> LeafSequence: 19 | leaves = [node for node in self.recurse(self._predictor.root, {})] 20 | return (leaf for leaf in leaves) 21 | 22 | @staticmethod 23 | def merge_constraints(constraints: LeafConstraints, constraint: Value, feature: str): 24 | if feature in constraints: 25 | try: 26 | constraints[feature][-1] *= constraint 27 | except SchemaException: 28 | constraints[feature].append(constraint) 29 | else: 30 | constraints[feature] = [constraint] 31 | return constraints 32 | 33 | def recurse(self, node, constraints) -> Union[LeafSequence, tuple[LeafConstraints, Any]]: 34 | if node.is_leaf_node(): 35 | return constraints, node.value 36 | 37 | feature = node.feature 38 | threshold = node.threshold if self.normalization is None else \ 39 | (node.threshold * self.normalization[feature][1] + self.normalization[feature][0]) 40 | 41 | left = self.recurse(node.left, self.merge_constraints(copy.deepcopy(constraints), LessThan(threshold), feature)) 42 | right = self.recurse(node.right, self.merge_constraints(copy.deepcopy(constraints), 43 | GreaterThan(threshold), feature)) 44 | return (left if isinstance(left, list) else [left]) + (right if isinstance(right, list) else [right]) 45 | 46 | @property 47 | def predictor(self) -> Union[FairTreeClassifier, FairTreeRegressor]: 48 | return self._predictor 49 | 50 | @property 51 | def n_leaves(self) -> int: 52 | return self._predictor.n_leaves 53 | 54 | @predictor.setter 55 | def predictor(self, predictor: Union[FairTreeClassifier, FairTreeRegressor]): 56 | self._predictor = predictor 57 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/test_simplified_cart.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | from parameterized import parameterized_class 5 | from 
sklearn.model_selection import train_test_split 6 | from tuprolog.solve.prolog import prolog_solver 7 | from tuprolog.theory import mutable_theory 8 | 9 | from psyke import Extractor 10 | from psyke.utils import get_default_precision 11 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 12 | from test import get_dataset, get_model 13 | import unittest 14 | 15 | 16 | # TODO: should be refactored using the a .csv file 17 | from test.unit import get_substitutions 18 | 19 | 20 | @parameterized_class([{"dataset": "iris", "predictor": "DTC", "task": "extraction"}, 21 | {"dataset": "house", "predictor": "DTR", "task": "hypercubic"}]) 22 | class TestSimplifiedCart(unittest.TestCase): 23 | 24 | def test_equality(self): 25 | dataset = get_dataset(self.dataset) 26 | dataset = dataset.reindex(sorted(dataset.columns[:-1]) + [dataset.columns[-1]], axis=1) 27 | train, test = train_test_split(dataset, test_size=0.5) 28 | tree, _ = get_model(self.predictor, {}) 29 | tree.fit(train.iloc[:, :-1], train.iloc[:, -1]) 30 | extractor = Extractor.cart(tree, simplify=False) 31 | theory = extractor.extract(train) 32 | simplified_extractor = Extractor.cart(tree) 33 | simplified_theory = simplified_extractor.extract(train) 34 | 35 | index = test.shape[1] - 1 36 | cast, substitutions = get_substitutions(test, theory) 37 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions] 38 | 39 | cast, simplified_substitutions = get_substitutions(test, simplified_theory) 40 | simplified_expected = [cast(query.solved_query.get_arg_at(index)) for query in simplified_substitutions] 41 | 42 | if isinstance(test.iloc[0, -1], str): 43 | self.assertTrue(all(np.array(extractor.predict(test.iloc[:, :-1])) == 44 | np.array(simplified_extractor.predict(test.iloc[:, :-1])))) 45 | self.assertEqual(expected, simplified_expected) 46 | else: 47 | self.assertTrue(max(abs(np.array(extractor.predict(test.iloc[:, :-1])) - 48 | np.array(simplified_extractor.predict(test.iloc[:, :-1]))) 49 | ) < get_default_precision()) 50 | self.assertTrue(max(abs(np.array(expected) - np.array(simplified_expected))) < get_default_precision()) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import chain 3 | from typing import Iterable, Any 4 | import pandas as pd 5 | 6 | 7 | class Node: 8 | 9 | def __init__(self, samples: pd.DataFrame, n_examples: int, constraints: Iterable[tuple[str, float]] = None, 10 | children: list[Node] = None, depth: int = 0): 11 | self.samples = samples 12 | self.n_examples = n_examples 13 | self.constraints = [] if constraints is None else constraints 14 | self.children = [] if children is None else children 15 | self.depth = depth 16 | 17 | def __str__(self): 18 | name = ''.join(('' if c[1] > 0 else '!') + c[0] + ', ' for c in self.constraints) 19 | return name[:-2] + ' = ' + str(self.dominant) 20 | 21 | @property 22 | def priority(self) -> float: 23 | return -(self.reach * (1 - self.fidelity)) 24 | 25 | @property 26 | def fidelity(self) -> float: 27 | return 1.0 * self.correct / (self.samples.shape[0] if self.samples.shape[0] > 0 else 1) 28 | 29 | @property 30 | def reach(self) -> float: 31 | return 1.0 * self.samples.shape[0] / self.n_examples 32 | 33 | @property 34 | def correct(self) -> float: 35 | return 
sum(self.samples.iloc[:, -1] == self.dominant) 36 | 37 | @property 38 | def dominant(self) -> Any: 39 | return self.samples.iloc[:, -1].mode()[0] if self.samples.shape[0] > 0 else '' 40 | 41 | @property 42 | def n_classes(self) -> int: 43 | return len(set(self.samples.iloc[:, -1])) 44 | 45 | def __iter__(self) -> Iterable[Node]: 46 | for child in chain(*map(iter, self.children)): 47 | yield child 48 | yield self 49 | 50 | 51 | class Split: 52 | 53 | # TODO: should be configurable by user 54 | PRIORITY_BONUS: int = 100 55 | PRIORITY_PENALTY: int = 200 56 | 57 | def __init__(self, parent: Node, children: tuple[Node, Node]): 58 | self.parent = parent 59 | self.children = children 60 | 61 | @property 62 | def priority(self) -> float: 63 | return self.__priority(self.parent) 64 | 65 | def __priority(self, parent: Node) -> float: 66 | true_node, false_node = self.children 67 | priority = - (true_node.fidelity + false_node.fidelity) 68 | for node in [true_node, false_node]: 69 | priority -= self.PRIORITY_BONUS if parent.n_classes > node.n_classes else 0 70 | priority += self.PRIORITY_PENALTY if true_node.dominant == false_node.dominant else 0 71 | return priority 72 | 73 | 74 | class SplitLogic: 75 | 76 | DEFAULT = 1 77 | -------------------------------------------------------------------------------- /psyke/tuning/crash/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import pandas as pd 4 | 5 | from psyke.tuning import Objective, SKEOptimizer 6 | from psyke.tuning.orchid import OrCHiD 7 | from psyke.utils import Target 8 | 9 | 10 | class CRASH(SKEOptimizer): 11 | class Algorithm(Enum): 12 | ExACT = 1, 13 | CREAM = 2 14 | 15 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 16 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 17 | max_gauss_components: int = 5, patience: int = 5, output: Target = Target.CONSTANT, 18 | objective: Objective = Objective.MODEL, normalization=None, discretization=None): 19 | super().__init__(predictor, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 20 | patience, objective, output, normalization, discretization) 21 | self.max_depth = max_depth 22 | self.max_gauss_components = max_gauss_components 23 | 24 | def search(self): 25 | self.params = [] 26 | for algorithm in [OrCHiD.Algorithm.ExACT, OrCHiD.Algorithm.CREAM]: 27 | self.params += self.__search_algorithm(algorithm) 28 | 29 | def __search_algorithm(self, algorithm): 30 | params = [] 31 | best = None 32 | 33 | for gauss_components in range(2, self.max_gauss_components + 1): 34 | data = self.dataframe.sample(n=gauss_components * 100) if gauss_components * 100 < len(self.dataframe) \ 35 | else self.dataframe 36 | current_params = self.__search_components(data, algorithm, gauss_components) 37 | current_best = self._best(current_params)[1] 38 | if best is not None and self._score(best) <= self._score(current_best): 39 | break 40 | best = current_best 41 | params += current_params 42 | 43 | return params 44 | 45 | def __search_components(self, data, algorithm, gauss_components): 46 | orchid = OrCHiD(data, algorithm, self.output, self.max_error_increase, self.min_rule_decrease, 47 | self.readability_tradeoff, self.patience, self.max_depth, gauss_components, 48 | self.normalization, self.discretization) 49 | orchid.search() 50 | return [(*p, gauss_components, algorithm) for p in orchid.params] 51 | 52 | def _print_params(self, name, 
params): 53 | print("*****************************") 54 | print(f"Best {name}") 55 | print("*****************************") 56 | print(f"MAE = {params[0]:.2f}, {params[1]} rules") 57 | print(f"Algorithm = {params[5]}") 58 | print(f"Threshold = {params[3]:.2f}") 59 | print(f"Depth = {params[2]}") 60 | print(f"Gaussian components = {params[4]}") 61 | -------------------------------------------------------------------------------- /psyke/clustering/cream/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from psyke.utils import Target, get_default_random_seed 9 | from psyke.clustering.exact import ExACT 10 | from psyke.extraction.hypercubic import Node, HyperCube, ClosedCube 11 | from psyke.clustering.utils import select_gaussian_mixture 12 | 13 | 14 | class CREAM(ExACT): 15 | """ 16 | Explanator implementing CREAM algorithm. 17 | """ 18 | 19 | def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5, 20 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 21 | super().__init__(depth, error_threshold, output, gauss_components, discretization, normalization, seed) 22 | 23 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 24 | cubes = [] 25 | for i in range(len(np.unique(gauss_pred))): 26 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 27 | if len(df) == 0: 28 | continue 29 | inner_cube = self._create_cube(df, clusters) 30 | indices = self._indices(inner_cube, node.dataframe) 31 | if indices is None: 32 | continue 33 | right, left = self._split(inner_cube, node.cube, node.dataframe, indices) 34 | cubes.append(( 35 | ((right.diversity + left.diversity) / 2, right.volume(), left.volume(), i), 36 | (right, indices), (left, ~indices) 37 | )) 38 | return cubes 39 | 40 | def _split(self, right: ClosedCube, outer_cube: ClosedCube, data: pd.DataFrame, indices: np.ndarray): 41 | right.update(data.iloc[indices], self._predictor) 42 | left = outer_cube.copy() 43 | left.update(data.iloc[~indices], self._predictor) 44 | return right, left 45 | 46 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 47 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 48 | while len(to_split) > 0: 49 | node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split) 50 | cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 51 | if len(cubes) < 1: 52 | continue 53 | _, right, left = min(cubes) 54 | # find_better_constraints(node.dataframe[right[1]], right[0]) 55 | node.right = Node(node.dataframe[right[1]], right[0]) 56 | node.cube.update(node.dataframe[left[1]], self._predictor) 57 | node.left = Node(node.dataframe[left[1]], left[0]) 58 | 59 | if depth < self.depth: 60 | to_split += [ 61 | (error, depth + 1, np.random.uniform(), n) for (n, error) in 62 | zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold 63 | ] 64 | return self._node_to_cubes(surrounding) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | VERSION 4 | 5 | .idea/ 6 | .vscode/ 7 | 8 | *~ 9 | *.jar 10 | 
11 | ### Python ### 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | pytestdebug.log 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | doc/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | pythonenv* 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ 144 | 145 | # pytype static type analyzer 146 | .pytype/ 147 | 148 | # profiling data 149 | .prof 150 | 151 | # End of https://www.toptal.com/developers/gitignore/api/python 152 | 153 | # macOS stuff 154 | .DS_store 155 | 156 | # File ONNX 157 | *.onnx 158 | 159 | # Local stuff 160 | dummy/ 161 | tmp_model/ 162 | plots/ 163 | demo/ 164 | -------------------------------------------------------------------------------- /psyke/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 4 | 5 | from psyke.extraction import PedagogicalExtractor 6 | from psyke import get_default_random_seed 7 | from psyke.extraction.cart.FairTree import FairTreeClassifier, FairTreeRegressor 8 | from psyke.schema import DiscreteFeature, Value 9 | from tuprolog.theory import Theory 10 | from typing import Iterable, Any 11 | import pandas as pd 12 | 13 | 14 | TREE_SEED = get_default_random_seed() 15 | 16 | LeafConstraints = dict[str, list[Value]] 17 | LeafSequence = Iterable[tuple[LeafConstraints, Any]] 18 | 19 | 20 | class Cart(PedagogicalExtractor, ABC): 21 | 22 | def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None, 23 | discretization: Iterable[DiscreteFeature] = None, 24 | normalization=None, simplify: bool = True): 25 | from psyke.extraction.cart.CartPredictor import CartPredictor 26 | 27 | super().__init__(predictor, discretization, normalization) 28 | self.is_fair = None 29 | self._cart_predictor = CartPredictor(discretization=discretization, normalization=normalization) 30 | self.depth = max_depth 31 | self.leaves = max_leaves 32 | self.max_features = max_features 33 | self._simplify = simplify 34 | 35 | def _extract(self, data: pd.DataFrame) -> Theory: 36 | from psyke.extraction.cart.FairTreePredictor import FairTreePredictor 37 | 38 | if self.is_fair: 39 | self._cart_predictor = FairTreePredictor(discretization=self.discretization, 40 | normalization=self.normalization) 41 | fair_tree = FairTreeClassifier if isinstance(data.iloc[0, -1], str) else FairTreeRegressor 42 | self._cart_predictor.predictor = fair_tree(max_depth=self.depth, max_leaves=self.leaves, 43 | protected_attr=self.is_fair) 44 | else: 45 | tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor 46 | self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth, 47 | max_leaf_nodes=self.leaves, max_features=self.max_features) 48 | self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1]) 49 | return self._cart_predictor.create_theory(data, self._simplify) 50 | 51 | def make_fair(self, features: Iterable[str]): 52 | self.is_fair = features 53 | 54 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 55 | return self._cart_predictor.predict(dataframe) 56 | 57 | def 
predict_why(self, data: dict[str, float], verbose=True): 58 | prediction = None 59 | conditions = {} 60 | if self.normalization is not None: 61 | data = {k: v * self.normalization[k][1] + self.normalization[k][0] if k in self.normalization else v 62 | for k, v in data.items()} 63 | for conditions, prediction in self._cart_predictor: 64 | if all(all(interval.is_in(data[variable]) for interval in intervals) 65 | for variable, intervals in conditions.items()): 66 | break 67 | return prediction, conditions 68 | 69 | @property 70 | def n_rules(self) -> int: 71 | return self._cart_predictor.n_leaves 72 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import product 3 | from typing import Iterable 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from tuprolog.theory import Theory 8 | from psyke import get_default_random_seed 9 | from psyke.utils import Target 10 | from psyke.extraction.hypercubic import HyperCubeExtractor, Grid, HyperCube 11 | 12 | 13 | class GridEx(HyperCubeExtractor): 14 | """ 15 | Explanator implementing GridEx algorithm, doi:10.1007/978-3-030-82017-6_2. 16 | """ 17 | 18 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 19 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 20 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 21 | discretization, normalization) 22 | self.grid = grid 23 | self.min_examples = min_examples 24 | self.threshold = threshold 25 | np.random.seed(seed) 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | self._hypercubes = [] 29 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 30 | self._surrounding.init_diversity(2 * self.threshold) 31 | self._iterate(dataframe) 32 | return self._create_theory(dataframe) 33 | 34 | def _create_ranges(self, cube, iteration): 35 | ranges = {} 36 | for (feature, (a, b)) in cube.dimensions.items(): 37 | n_bins = self.grid.get(feature, iteration) 38 | if n_bins == 1: 39 | ranges[feature] = [(a, b)] 40 | self._dimensions_to_ignore.add(feature) 41 | else: 42 | size = (b - a) / n_bins 43 | ranges[feature] = [(a + size * i, a + size * (i + 1)) for i in range(n_bins)] 44 | return ranges 45 | 46 | def _cubes_to_split(self, cube, iteration, dataframe, fake, keep_empty=False): 47 | to_split = [] 48 | for p in product(*self._create_ranges(cube, iteration).values()): 49 | cube = self._default_cube() 50 | for i, f in enumerate(dataframe.columns[:-1]): 51 | cube.update_dimension(f, p[i]) 52 | n = cube.count(dataframe) 53 | if n > 0 or keep_empty: 54 | fake = pd.concat([fake, cube.create_samples(self.min_examples - n)]) 55 | cube.update(fake, self.predictor) 56 | to_split.append(cube) 57 | return to_split, fake 58 | 59 | def _iterate(self, dataframe: pd.DataFrame): 60 | fake = dataframe.copy() 61 | prev = [self._surrounding] 62 | 63 | for iteration in self.grid.iterate(): 64 | next_iteration = [] 65 | for cube in prev: 66 | if cube.count(dataframe) == 0: 67 | continue 68 | if cube.diversity < self.threshold: 69 | self._hypercubes.append(cube) 70 | continue 71 | to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake) 72 | 
next_iteration.extend(self._merge(to_split, fake)) 73 | prev = next_iteration 74 | self._hypercubes.extend(prev) 75 | 76 | def make_fair(self, features: Iterable[str]): 77 | self.grid.make_fair(features) 78 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: check 2 | on: 3 | push: 4 | tags: '*' 5 | branches-ignore: 6 | - 'autodelivery**' 7 | - 'bump-**' 8 | - 'dependabot/**' 9 | paths-ignore: 10 | - 'CHANGELOG.md' 11 | - 'renovate.json' 12 | - '.gitignore' 13 | pull_request: 14 | workflow_dispatch: 15 | env: 16 | PROJECT_NAME: psyke-python 17 | WORKFLOW: check 18 | TEST_SUBMODULE: psykei/psyke-pytest 19 | jobs: 20 | create-test-predictors-if-needed: 21 | runs-on: ubuntu-latest 22 | name: Create test predictors if needed 23 | # TODO: short circuit job as soon as it's possible: 24 | # https://github.com/actions/runner/issues/662 25 | # if: ${{ github.repository == 'psykei/psyke-python' }} 26 | steps: 27 | - name: Checkout code 28 | if: ${{ github.repository == 'psykei/psyke-python' }} 29 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 30 | with: 31 | fetch-depth: 0 32 | submodules: recursive 33 | 34 | - name: Get Python Version 35 | if: ${{ github.repository == 'psykei/psyke-python' }} 36 | id: get-python-version 37 | run: echo ::set-output name=version::$(cat .python-version) 38 | 39 | - name: Setup Python 40 | if: ${{ github.repository == 'psykei/psyke-python' }} 41 | uses: actions/setup-python@v6 42 | with: 43 | python-version: ${{ steps.get-python-version.outputs.version }} 44 | 45 | - name: Restore Python dependencies 46 | if: ${{ github.repository == 'psykei/psyke-python' }} 47 | run: pip install -r requirements.txt 48 | 49 | # - name: Create missing predictors 50 | # if: ${{ github.repository == 'psykei/psyke-python' }} 51 | # run: python setup.py create_test_predictors 52 | 53 | - name: Submodule update 54 | if: ${{ github.repository == 'psykei/psyke-python' }} 55 | run: | 56 | pushd test/resources 57 | git config user.email "bot@noreply.github.com" 58 | git config user.name "CI bot" 59 | git remote set-url origin https://x-access-token:${{ secrets.TRIGGER_GITHUB_ACTION }}@github.com/${{ env.TEST_SUBMODULE }} 60 | (git add predictors/*.onnx tests/*.csv datasets/*.csv) || echo 'nothing to add' 61 | (git commit -m 'predictors update from workflows') || echo 'nothing to commit' 62 | (git push) || echo 'nothing to push' 63 | run-unit-tests: 64 | strategy: 65 | fail-fast: false 66 | matrix: 67 | os: 68 | - ubuntu-latest 69 | - windows-latest 70 | - macos-latest 71 | python-version: 72 | - '3.11.0' 73 | runs-on: ${{ matrix.os }} 74 | name: Run tests on Python ${{ matrix.python-version }}, on ${{ matrix.os }} 75 | timeout-minutes: 45 76 | concurrency: 77 | group: ${{ github.workflow }}-run-unit-tests-${{ matrix.python-version }}-${{ matrix.os }}-${{ github.event.number || github.ref }} 78 | cancel-in-progress: true 79 | needs: 80 | - create-test-predictors-if-needed 81 | steps: 82 | - name: Setup Python 83 | uses: actions/setup-python@v6 84 | with: 85 | python-version: ${{ matrix.python-version }} 86 | 87 | - name: Checkout code 88 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 89 | with: 90 | fetch-depth: 0 91 | submodules: recursive 92 | 93 | - name: Restore Python dependencies 94 | run: pip install -r requirements.txt 95 | 96 | - name: Test 97 | run: python -m unittest discover -s test -t . 
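# The same check can be reproduced locally, assuming Python 3.11 and the test/resources
# submodule are available; these two commands simply mirror the workflow steps above:
#   pip install -r requirements.txt
#   python -m unittest discover -s test -t .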
98 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_node.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node 2 | from test import get_dataset 3 | import pandas as pd 4 | import unittest 5 | 6 | 7 | class TestNode(unittest.TestCase): 8 | 9 | dataset: pd.DataFrame = get_dataset('iris') 10 | n_examples = dataset.shape[0] 11 | all_node = Node(dataset, n_examples) 12 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 13 | virginica_10 = Node(dataset.iloc[95:110, :], n_examples) 14 | versicolor_50 = Node(dataset.iloc[20:130, :], n_examples) 15 | 16 | def test_reach(self): 17 | node = Node(self.dataset, self.n_examples) 18 | self.assertEqual(node.reach, self.all_node.reach) 19 | self.assertTrue(self.virginica_10.reach < self.setosa_40.reach) 20 | self.assertTrue(self.setosa_40.reach < self.versicolor_50.reach) 21 | self.assertTrue(self.versicolor_50.reach < self.all_node.reach) 22 | 23 | def test_dominant(self): 24 | self.assertEqual('setosa', self.setosa_40.dominant) 25 | self.assertEqual('virginica', self.virginica_10.dominant) 26 | self.assertEqual('versicolor', self.versicolor_50.dominant) 27 | 28 | def test_correct(self): 29 | self.assertEqual(50, self.versicolor_50.correct) 30 | self.assertEqual(40, self.setosa_40.correct) 31 | self.assertEqual(10, self.virginica_10.correct) 32 | 33 | def test_fidelity(self): 34 | self.assertEqual(50 / 150, self.all_node.fidelity) 35 | self.assertEqual(40 / 60, self.setosa_40.fidelity) 36 | self.assertEqual(10 / 15, self.virginica_10.fidelity) 37 | self.assertEqual(50 / 110, self.versicolor_50.fidelity) 38 | 39 | def test_priority(self): 40 | self.assertTrue(self.all_node.priority < self.versicolor_50.priority) 41 | self.assertTrue(self.versicolor_50.priority < self.setosa_40.priority) 42 | self.assertTrue(self.setosa_40.priority < self.virginica_10.priority) 43 | 44 | def test_n_classes(self): 45 | self.assertEqual(3, self.all_node.n_classes) 46 | self.assertEqual(2, self.virginica_10.n_classes) 47 | self.assertEqual(2, self.setosa_40.n_classes) 48 | self.assertEqual(3, self.versicolor_50.n_classes) 49 | self.assertEqual(1, Node(self.dataset.iloc[15:40, :], self.n_examples).n_classes) 50 | 51 | def test_iterator(self): 52 | node = Node(self.dataset, self.n_examples) 53 | child_1 = Node(self.dataset.iloc[:50, :], self.n_examples) 54 | child_2 = Node(self.dataset.iloc[50:150, :], self.n_examples) 55 | node.children = [child_1, child_2] 56 | grandchild_1_1 = Node(self.dataset.iloc[:25, :], self.n_examples) 57 | grandchild_2_1 = Node(self.dataset.iloc[50:80, :], self.n_examples) 58 | grandchild_2_2 = Node(self.dataset.iloc[80:120, :], self.n_examples) 59 | child_1.children = [grandchild_1_1] 60 | child_2.children = [grandchild_2_1, grandchild_2_2] 61 | self.assertEqual(list(node), list(child_1) + list(child_2) + [node]) 62 | self.assertEqual([grandchild_1_1, child_1, grandchild_2_1, grandchild_2_2, child_2, node], list(node)) 63 | 64 | def test_to_string(self): 65 | node = Node(self.dataset, self.n_examples, (('V1', 0.0), ('V2', 1.0))) 66 | self.assertEqual(' = setosa', str(self.all_node)) 67 | self.assertEqual(' = setosa', str(self.setosa_40)) 68 | self.assertEqual(' = versicolor', str(self.versicolor_50)) 69 | self.assertEqual(' = virginica', str(self.virginica_10)) 70 | self.assertEqual('!V1, V2 = setosa', str(node)) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | 
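# For reference, the ordering asserted in test_priority follows directly from the Node
# properties defined in psyke/extraction/trepan/utils.py: reach = |samples| / n_examples,
# fidelity = correct / |samples|, and priority = -(reach * (1 - fidelity)).
# A minimal standalone sketch, plugging in the reach/fidelity values already asserted above:
#
#     def priority(reach, fidelity):
#         return -(reach * (1 - fidelity))
#
#     priority(150 / 150, 50 / 150)   # all_node      ~ -0.667
#     priority(110 / 150, 50 / 110)   # versicolor_50 ~ -0.400
#     priority(60 / 150, 40 / 60)     # setosa_40     ~ -0.133
#     priority(15 / 150, 10 / 15)     # virginica_10  ~ -0.033
#
# which reproduces all_node < versicolor_50 < setosa_40 < virginica_10.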
-------------------------------------------------------------------------------- /psyke/tuning/orchid/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from psyke import Clustering, EvaluableModel 7 | from psyke.tuning import Optimizer, IterativeOptimizer 8 | from psyke.utils import Target 9 | 10 | 11 | class OrCHiD(IterativeOptimizer): 12 | class Algorithm(Enum): 13 | ExACT = 1, 14 | CREAM = 2 15 | 16 | def __init__(self, dataframe: pd.DataFrame, algorithm, output: Target = Target.CONSTANT, 17 | max_error_increase: float = 1.2, min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, 18 | patience: int = 5, max_depth: int = 10, gauss_components=10, normalization=None, discretization=None): 19 | super().__init__(dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, max_depth, patience, 20 | output, normalization, discretization) 21 | self.algorithm = algorithm 22 | self.gauss_components = gauss_components 23 | 24 | def search(self): 25 | self.params = self.__search_depth() 26 | 27 | def __search_depth(self): 28 | params, best = [], None 29 | 30 | for depth in range(1, self.max_depth + 1): 31 | current_params = self.__search_threshold(depth) 32 | current_best = self._best(current_params)[1] 33 | print() 34 | best, to_break = self._check_iteration_improvement(best, current_best) 35 | params += current_params 36 | 37 | if len(params) > 1 and to_break: 38 | break 39 | return params 40 | 41 | def __search_threshold(self, depth): 42 | step = 1.0 43 | threshold = 1.0 44 | params = [] 45 | patience = self.patience 46 | while patience > 0: 47 | print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. " 48 | f"Gaussian components = {self.gauss_components}. 
", end="") 49 | clustering = (Clustering.cream if self.algorithm == OrCHiD.Algorithm.CREAM else Clustering.exact)( 50 | depth=depth, error_threshold=threshold, gauss_components=self.gauss_components, output=self.output 51 | ) 52 | clustering.fit(self.dataframe) 53 | task, metric = \ 54 | (EvaluableModel.Task.CLASSIFICATION, EvaluableModel.ClassificationScore.INVERSE_ACCURACY) \ 55 | if self.output == Target.CLASSIFICATION else \ 56 | (EvaluableModel.Task.REGRESSION, EvaluableModel.RegressionScore.MAE) 57 | p, n = clustering.score(self.dataframe, None, False, False, task=task, 58 | scoring_function=[metric])[metric][0], clustering.n_rules 59 | 60 | print(f"Predictive loss = {p:.2f}, {n} rules") 61 | 62 | if len(params) == 0: 63 | params.append((p, n, depth, threshold)) 64 | threshold = p / 20 65 | step = p / self.patience * 0.75 66 | continue 67 | 68 | if (n == 1) or (p == 0.0): 69 | params.append((p, n, depth, threshold)) 70 | break 71 | 72 | if p > params[0][0] * self.max_error_increase: 73 | break 74 | 75 | improvement = (params[-1][0] / p) + (1 - n / params[-1][1]) 76 | 77 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 78 | patience -= 1 79 | if p != params[-1][0] or n != params[-1][1]: 80 | params.append((p, n, depth, threshold)) 81 | threshold += step 82 | return params 83 | 84 | def _print_params(self, name, params): 85 | print("*" * 40) 86 | print(f"* Best {name}") 87 | print("*" * 40) 88 | print(f"* Predictive loss = {params[0]:.2f}, {params[1]} rules") 89 | print(f"* Threshold = {params[3]:.2f}") 90 | print(f"* Depth = {params[2]}") 91 | print("*" * 40) 92 | -------------------------------------------------------------------------------- /psyke/tuning/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from enum import Enum 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from psyke.extraction.hypercubic import Grid 7 | from psyke.utils import Target 8 | 9 | 10 | class Objective(Enum): 11 | MODEL = 1, 12 | DATA = 2 13 | 14 | 15 | class Optimizer: 16 | def __init__(self, dataframe: pd.DataFrame, output: Target = Target.CONSTANT, max_error_increase: float = 1.2, 17 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 18 | normalization=None, discretization=None): 19 | self.dataframe = dataframe 20 | self.output = output 21 | self.max_error_increase = max_error_increase 22 | self.min_rule_decrease = min_rule_decrease 23 | self.readability_tradeoff = readability_tradeoff 24 | self.patience = patience 25 | self.params = None 26 | self.normalization = normalization 27 | self.discretization = discretization 28 | 29 | def search(self): 30 | raise NotImplementedError 31 | 32 | def _best(self, params): 33 | param_dict = {self._score(t): t for t in params} 34 | min_param = min(param_dict) 35 | return min_param, param_dict[min_param] 36 | 37 | def _score(self, param): 38 | return param[0] * np.ceil(param[1] * self.readability_tradeoff) 39 | 40 | def _best_param(self, param): 41 | param_dict = {t[param]: t for t in self.params} 42 | min_param = min(param_dict) 43 | return min_param, param_dict[min_param] 44 | 45 | def get_best(self): 46 | names = ["Combined", "Predictive loss", "N rules"] 47 | params = [self._best(self.params), self._best_param(0), self._best_param(1)] 48 | for n, p in zip(names, params): 49 | self._print_params(n, p[1]) 50 | print() 51 | return self._best(self.params)[1], self._best_param(0)[1], self._best_param(1)[1] 52 | 53 | def 
_print_params(self, n, param): 54 | raise NotImplementedError 55 | 56 | 57 | class SKEOptimizer(Optimizer, ABC): 58 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 59 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 60 | objective: Objective = Objective.MODEL, output: Target = Target.CONSTANT, 61 | normalization=None, discretization=None): 62 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 63 | patience, normalization, discretization) 64 | self.predictor = predictor 65 | self.objective = objective 66 | 67 | 68 | class IterativeOptimizer(Optimizer, ABC): 69 | def __init__(self, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 70 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 71 | patience: int = 5, output: Target = Target.CONSTANT, normalization=None, discretization=None): 72 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 73 | patience, normalization, discretization) 74 | self.max_depth = max_depth 75 | 76 | def _iteration_improvement(self, best, other): 77 | if other[0] == best[0]: 78 | return (best[1] - other[1]) * 2 79 | return 1 / ( 80 | (1 - other[0] / best[0]) ** self.readability_tradeoff * 81 | np.ceil(other[1] / self.readability_tradeoff) / np.ceil(best[1] / self.readability_tradeoff) 82 | ) 83 | 84 | def _check_iteration_improvement(self, best, current): 85 | improvement = \ 86 | self._iteration_improvement([best[0], best[1]], [current[0], current[1]]) if best is not None else np.inf 87 | if isinstance(improvement, complex): 88 | improvement = 1.0 89 | return current, improvement < 1.2 90 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/divine/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tuprolog.theory import Theory 4 | 5 | from psyke import Target, get_default_random_seed 6 | from psyke.extraction.hypercubic import HyperCubeExtractor 7 | from psyke.extraction.hypercubic.hypercube import Point, GenericCube, HyperCube 8 | 9 | from sklearn.neighbors import BallTree 10 | 11 | 12 | class DiViNE(HyperCubeExtractor): 13 | """ 14 | Explanator implementing DiViNE algorithm. 
15 | """ 16 | 17 | def __init__(self, predictor, k: int = 5, patience: int = 15, close_to_center: bool = True, 18 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.CLASSIFICATION, discretization, normalization) 20 | self.k = k 21 | self.patience = patience 22 | self.vicinity_function = DiViNE.closest_to_center if close_to_center else DiViNE.closest_to_corners 23 | self.seed = seed 24 | 25 | @staticmethod 26 | def __pop(data: pd.DataFrame, idx: int = None) -> (Point, pd.DataFrame): 27 | if idx is None: 28 | idx = data.sample(1).index.values[0] 29 | t = data.T 30 | return DiViNE.__to_point(t.pop(idx)), t.T.reset_index(drop=True) 31 | 32 | @staticmethod 33 | def __to_point(instance) -> Point: 34 | point = Point(instance.index.values, instance.values) 35 | return point 36 | 37 | def __to_cube(self, point: Point) -> GenericCube: 38 | cube = HyperCube.cube_from_point(point.dimensions, self._output) 39 | cube._output = list(point.dimensions.values())[-1] 40 | return cube 41 | 42 | def __clean(self, data: pd.DataFrame) -> pd.DataFrame: 43 | _, idx = BallTree(data.iloc[:, :-1]).query(data.iloc[:, :-1], k=self.k) 44 | # how many output classes are associated with the k neighbors 45 | count = np.array(list(map(lambda indices: len(data.iloc[indices].iloc[:, -1].unique()), idx))) 46 | # instances with neighbors of different classes are discarded 47 | return data[count == 1] 48 | 49 | def __closest(self, data: pd.DataFrame, cube: GenericCube) -> (Point, pd.DataFrame): 50 | return DiViNE.__pop(data, self.vicinity_function(BallTree(data.iloc[:, :-1]), cube)) 51 | 52 | @staticmethod 53 | def closest_to_center(tree: BallTree, cube: GenericCube): 54 | return tree.query([list(cube.center.dimensions.values())], k=1)[1][0][-1] 55 | 56 | @staticmethod 57 | def closest_to_corners(tree: BallTree, cube: GenericCube): 58 | distance, idx = tree.query([list(point.dimensions.values()) for point in cube.corners()], k=1) 59 | return idx[np.argmin(distance)][-1] 60 | 61 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 62 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=Target.CLASSIFICATION) 63 | np.random.seed(self.seed) 64 | data = self.__clean(dataframe) 65 | 66 | while len(data) > 0: 67 | discarded = [] 68 | patience = self.patience 69 | point, data = self.__pop(data) 70 | cube = self.__to_cube(point) 71 | 72 | while patience > 0 and len(data) > 0: 73 | other, data = self.__closest(data, cube) 74 | if cube.output == list(other.dimensions.values())[-1]: 75 | cube = cube.merge_with_point(other) 76 | data = data[~(cube.filter_indices(data.iloc[:, :-1]))].reset_index(drop=True) 77 | else: 78 | patience -= 1 79 | discarded.append(other) 80 | if cube.volume() > 0: 81 | cube.update(dataframe, self.predictor) 82 | self._hypercubes.append(cube) 83 | if len(discarded) > 0: 84 | data = pd.concat([data] + [d.to_dataframe() for d in discarded]).reset_index(drop=True) 85 | self._sort_cubes() 86 | return self._create_theory(dataframe) 87 | -------------------------------------------------------------------------------- /psyke/genetic/fgin/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from psyke import Target 5 | from psyke.genetic.gin import GIn 6 | 7 | import skfuzzy as skf 8 | 9 | 10 | class FGIn(GIn): 11 | 12 | def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3, 
13 | metric='R2', output=Target.REGRESSION, warm=False): 14 | super().__init__(train, valid, features, sigmas, slices, min_rules, poly, alpha, indpb, tournsize, 15 | metric, output, warm) 16 | self.feature_to_idx = {f: i for i, f in enumerate(self.X.columns)} 17 | 18 | def _evaluate(self, individual=None): 19 | y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0]) 20 | if valid_regions < self.min_rules: 21 | return -9999, 22 | return self._score(self.y if self.valid is None else self.valid[1], y_pred), 23 | 24 | @staticmethod 25 | def __generate_membership(var, domain, thresholds, shape='tri'): 26 | th = [var.min()] + [min(max(t, var.min()), var.max()) for t in thresholds] + [var.max()] 27 | 28 | if shape == 'tri': 29 | mid = [(x1 + x2) / 2 for x1, x2 in zip(th[:-1], th[1:])] 30 | return [skf.trapmf(domain, [domain.min()] * 2 + mid[:2])] + \ 31 | [skf.trimf(domain, [x1, x2, x3]) for x1, x2, x3 in zip(mid[:-2], mid[1:-1], mid[2:])] + \ 32 | [skf.trapmf(domain, mid[-2:] + [domain.max()] * 2)] 33 | if shape == 'trap': 34 | beg = [None, domain.min()] + [(3 * x1 + x2) / 4 for x1, x2 in zip(th[1:-1], th[2:])] + [domain.max()] 35 | end = [domain.min()] + [(x1 + 3 * x2) / 4 for x1, x2 in zip(th[:-2], th[1:-1])] + [domain.max()] 36 | return [skf.trapmf(domain, [end[i - 1], beg[i], end[i], beg[i + 1]]) for i in range(1, len(th))] 37 | raise ValueError('Supported shape values are only \'tri\' and \'trap\'') 38 | 39 | @staticmethod 40 | def __extend_domain(x, q_low=0.05, q_high=0.95, p=0.05, k_sigma=2.0, abs_min_margin=0.0): 41 | ql, qh = np.quantile(x, [q_low, q_high]) 42 | margin = max(p * (qh - ql), k_sigma * np.std(x), abs_min_margin) 43 | return np.array([ql - margin, qh + margin]) 44 | 45 | def __get_activations(self, x, functions_domains, valid_masks): 46 | levels = [np.array([skf.interp_membership(domain, mf, x[index]) for mf in mfs]) 47 | for mfs, domain, index in functions_domains.values()] 48 | return np.prod(np.meshgrid(*levels, indexing='ij'), axis=0).ravel()[valid_masks] 49 | 50 | def __fuzzify(self, cuts): 51 | cuts = dict(zip(self.features, cuts)) 52 | doms = {c: FGIn.__extend_domain(self.X[c]) for c in self.features} 53 | return {c: (FGIn.__generate_membership(self.X[c], doms[c], cuts[c], 'trap'), doms[c], 54 | self.feature_to_idx[c]) for c in self.features} 55 | 56 | def __predict(self, individual=None, to_pred=None): 57 | cuts = self._get_cuts(individual or self.best) 58 | masks = np.array([self._region(to_pred, cuts) == r for r in range(np.prod([s + 1 for s in self.slices]))]) 59 | valid_masks = masks.sum(axis=1) >= 3 60 | 61 | masks = [mask for mask in masks if mask.sum() >= 3] 62 | functions_domains = self.__fuzzify(cuts) 63 | 64 | pred = np.array([self._output_estimation(mask, to_pred) for mask in masks]).T 65 | activations = np.array([self.__get_activations(x, functions_domains, valid_masks) for x in to_pred.values]) 66 | 67 | if self.output == Target.CLASSIFICATION: 68 | classes, idx = np.unique(pred, return_inverse=True) 69 | pred = classes[np.argmax(np.vstack([activations[:, idx == i].sum(axis=1) for i, c in enumerate(classes)]), 70 | axis=0)] 71 | else: 72 | pred = (pred * activations).sum(axis=1) 73 | 74 | return pd.DataFrame(pred, index=to_pred.index), len(masks) 75 | -------------------------------------------------------------------------------- /test/unit/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from 
sklearn.model_selection import train_test_split 3 | from tuprolog.solve.prolog import prolog_solver 4 | from psyke.extraction.hypercubic import Grid, FeatureRanker 5 | from psyke.utils.dataframe import get_discrete_dataset 6 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 7 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 8 | from test import get_dataset, get_extractor, get_schema, get_model 9 | from test.resources.tests import test_cases 10 | from tuprolog.theory import Theory, mutable_theory 11 | from tuprolog.theory.parsing import parse_theory 12 | from typing import Callable 13 | import ast 14 | import numpy as np 15 | from psyke import get_default_random_seed 16 | 17 | 18 | def initialize(file: str) -> list[dict[str:Theory]]: 19 | for row in test_cases(file): 20 | params = dict() if row['extractor_params'] == '' else ast.literal_eval(row['extractor_params']) 21 | dataset = get_dataset(row['dataset']) 22 | 23 | training_set, test_set = train_test_split(dataset, test_size=0.05 if row['dataset'].lower() == 'house' else 0.5, 24 | random_state=get_default_random_seed()) 25 | 26 | schema, test_set_for_predictor = None, test_set 27 | if 'disc' in row.keys() and bool(row['disc']): 28 | schema = get_schema(training_set) 29 | params['discretization'] = schema 30 | training_set = get_discrete_dataset(training_set.iloc[:, :-1], schema) \ 31 | .join(training_set.iloc[:, -1].reset_index(drop=True)) 32 | test_set_for_predictor = get_discrete_dataset(test_set.iloc[:, :-1], schema) \ 33 | .join(test_set.iloc[:, -1].reset_index(drop=True)) 34 | 35 | # Handle Cart tests. 36 | # Cart needs to inspect the tree of the predictor. 37 | # Unfortunately onnx does not provide a method to do that. 38 | #if row['predictor'].lower() not in ['dtc', 'dtr']: 39 | # params['predictor'] = Predictor.load_from_onnx(str(get_predictor_path(row['predictor']))) 40 | #else: 41 | predictor, fitted = get_model(row['predictor'], {}) 42 | if not fitted: 43 | predictor.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 44 | params['predictor'] = predictor 45 | 46 | # Handle GridEx tests 47 | # TODO: this is algorithm specific therefore it should be handled inside the algorithm itself. 
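# The 'strategies' cell is eval'd into a (strategy, n) pair: "F" builds
# Grid(int(row['grid']), FixedStrategy(n)), while any other value ranks the training
# features with FeatureRanker on the fitted predictor and uses
# Grid(int(row['grid']), AdaptiveStrategy(ranked, n)) instead.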
48 | if 'grid' in row.keys() and bool: 49 | strategy, n = eval(row['strategies']) 50 | if strategy == "F": 51 | params['grid'] = Grid(int(row['grid']), FixedStrategy(n)) 52 | else: 53 | ranked = FeatureRanker(training_set.columns[:-1]) \ 54 | .fit(params['predictor'], training_set.iloc[:, :-1]).rankings() 55 | params['grid'] = Grid(int(row['grid']), AdaptiveStrategy(ranked, n)) 56 | 57 | extractor = get_extractor(row['extractor_type'], params) 58 | theory = extractor.extract(training_set) 59 | 60 | # Compute predictions from rules 61 | index = test_set.shape[1] - 1 62 | 63 | cast, substitutions = get_substitutions(test_set, theory) 64 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions if query.is_yes] 65 | predictions = [prediction for prediction in extractor.predict(test_set_for_predictor.iloc[:, :-1]) 66 | if prediction is not None] 67 | 68 | yield { 69 | 'extractor': extractor, 70 | 'extracted_theory': theory, 71 | 'extracted_test_y_from_theory': np.array(expected), 72 | 'extracted_test_y_from_extractor': np.array(predictions), 73 | 'test_set': test_set, 74 | 'expected_theory': parse_theory(row['theory'] + '.') if row['theory'] != '' else None, 75 | 'discretization': schema 76 | } 77 | 78 | 79 | def get_substitutions(test_set, theory): 80 | cast: Callable = lambda x: (str(x) if isinstance(test_set.iloc[0, -1], str) else float(x.value)) 81 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 82 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in test_set.iterrows()] 83 | return cast, substitutions 84 | -------------------------------------------------------------------------------- /psyke/hypercubepredictor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.neighbors import BallTree 8 | 9 | from psyke import EvaluableModel, Target, get_int_precision 10 | from psyke.extraction.hypercubic import RegressionCube, GenericCube, Point 11 | 12 | 13 | class HyperCubePredictor(EvaluableModel): 14 | def __init__(self, output=Target.CONSTANT, discretization=None, normalization=None): 15 | super().__init__(discretization, normalization) 16 | self._hypercubes = [] 17 | self._dimensions_to_ignore = set() 18 | self._output = output 19 | self._surrounding = None 20 | 21 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 22 | return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()]) 23 | 24 | def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable: 25 | predictions = np.array(self._predict(dataframe)) 26 | idx = [prediction is None for prediction in predictions] 27 | if sum(idx) > 0: 28 | if criterion == 'default': 29 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output( 30 | self._surrounding, row 31 | ) for _, row in dataframe[idx].iterrows()]) 32 | elif criterion == 'surface': 33 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output(self._brute_predict_surface(row), row) 34 | for _, row in dataframe[idx].iterrows()]) 35 | else: 36 | tree, cubes = self._create_brute_tree(criterion, n) 37 | predictions[idx] = np.array([HyperCubePredictor._brute_predict_from_cubes( 38 | row.to_dict(), tree, cubes 39 | ) for _, row in dataframe[idx].iterrows()]) 40 | return np.array(predictions) 41 | 42 | @staticmethod 43 | def 
_brute_predict_from_cubes(row: dict[str, float], tree: BallTree, 44 | cubes: list[GenericCube]) -> float | str: 45 | idx = tree.query([list(row.values())], k=1)[1][0][0] 46 | return HyperCubePredictor._get_cube_output(cubes[idx], row) 47 | 48 | def _brute_predict_surface(self, row: pd.Series) -> GenericCube: 49 | return min([( 50 | cube.surface_distance(Point(list(row.keys()), list(row.values))), cube.volume(), cube 51 | ) for cube in self._hypercubes])[-1] 52 | 53 | def _create_brute_tree(self, criterion: str = 'center', n: int = 2) -> (BallTree, list[GenericCube]): 54 | admissible_criteria = ['surface', 'center', 'corner', 'perimeter', 'density', 'default'] 55 | if criterion not in admissible_criteria: 56 | raise NotImplementedError( 57 | "'criterion' should be chosen in " + str(admissible_criteria) 58 | ) 59 | 60 | points = [(cube.center, cube) for cube in self._hypercubes] if criterion == 'center' else \ 61 | [(cube.barycenter, cube) for cube in self._hypercubes] if criterion == 'density' else \ 62 | [(corner, cube) for cube in self._hypercubes for corner in cube.corners()] if criterion == 'corner' else \ 63 | [(point, cube) for cube in self._hypercubes for point in cube.perimeter_samples(n)] \ 64 | if criterion == 'perimeter' else None 65 | 66 | return BallTree(pd.concat([point[0].to_dataframe() for point in points], ignore_index=True)), \ 67 | [point[1] for point in points] 68 | 69 | def _predict_from_cubes(self, data: dict[str, float]) -> float | str | None: 70 | cube = self._find_cube(data) 71 | if cube is None: 72 | return None 73 | elif self._output == Target.CLASSIFICATION: 74 | return HyperCubePredictor._get_cube_output(cube, data) 75 | else: 76 | return round(HyperCubePredictor._get_cube_output(cube, data), get_int_precision()) 77 | 78 | def _find_cube(self, data: dict[str, float]) -> GenericCube | None: 79 | if not self._hypercubes: 80 | return None 81 | data = data.copy() 82 | for dimension in self._dimensions_to_ignore: 83 | if dimension in data: 84 | del data[dimension] 85 | for cube in self._hypercubes: 86 | if data in cube: 87 | return cube.copy() 88 | if self._hypercubes[-1].is_default: 89 | return self._hypercubes[-1].copy() 90 | 91 | @property 92 | def n_rules(self): 93 | return len(list(self._hypercubes)) 94 | 95 | @property 96 | def volume(self): 97 | return sum([cube.volume() for cube in self._hypercubes]) 98 | 99 | @staticmethod 100 | def _get_cube_output(cube, data: dict[str, float]) -> float: 101 | return cube.output.predict(pd.DataFrame([data])).flatten()[0] if \ 102 | isinstance(cube, RegressionCube) else cube.output 103 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/ginger/__init__.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Iterable 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from tuprolog.theory import Theory 9 | 10 | from psyke import get_default_random_seed, Target 11 | from psyke.extraction.hypercubic import HyperCubeExtractor, HyperCube, RegressionCube 12 | 13 | from deap import base, creator 14 | 15 | from psyke.genetic.gin import GIn 16 | 17 | 18 | class GInGER(HyperCubeExtractor): 19 | """ 20 | Explanator implementing GInGER algorithm. 
21 | """ 22 | 23 | def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5, 24 | tournsize=3, metric='R2', n_gen=50, n_pop=50, threshold=None, valid=None, 25 | output: Target = Target.REGRESSION, normalization=None, seed: int = get_default_random_seed()): 26 | super().__init__(predictor, output=Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 27 | normalization=normalization) 28 | self.threshold = threshold 29 | np.random.seed(seed) 30 | 31 | self.features = features 32 | self.max_features = len(features) 33 | self.sigmas = sigmas 34 | self.max_slices = max_slices 35 | self.min_rules = min_rules 36 | self.poly = max_poly 37 | self.trained_poly = None 38 | 39 | self.alpha = alpha 40 | self.indpb = indpb 41 | self.tournsize = tournsize 42 | self.metric = metric 43 | 44 | self.n_gen = n_gen 45 | self.n_pop = n_pop 46 | self.valid = valid 47 | 48 | creator.create("FitnessMax", base.Fitness, weights=(1.0,)) 49 | creator.create("Individual", list, fitness=creator.FitnessMax) 50 | 51 | def __poly_names(self): 52 | return [''.join(['' if pp == 0 else f'{n} * ' if pp == 1 else f'{n}**{pp} * ' 53 | for pp, n in zip(p, self.trained_poly.feature_names_in_)])[:-3] 54 | for p in self.trained_poly.powers_] 55 | 56 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 57 | dataframe = pd.DataFrame(self.trained_poly.fit_transform(dataframe), columns=self.__poly_names()) 58 | return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()]) 59 | 60 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 61 | best = {} 62 | for poly in range(self.poly): 63 | for slices in list(itertools.product(range(1, self.max_slices + 1), repeat=self.max_features)): 64 | gr = GIn((dataframe.iloc[:, :-1], dataframe.iloc[:, -1]), self.valid, self.features, self.sigmas, 65 | slices, min_rules=self.min_rules, poly=poly + 1, alpha=self.alpha, indpb=self.indpb, 66 | tournsize=self.tournsize, metric=self.metric, output=self._output, warm=True) 67 | 68 | b, score, _, _ = gr.run(n_gen=self.n_gen, n_pop=self.n_pop) 69 | best[(score, poly + 1, slices)] = b 70 | m = min(best) 71 | poly, slices, best = m[1], m[2], best[m] 72 | self.trained_poly = PolynomialFeatures(degree=poly, include_bias=False) 73 | transformed = pd.DataFrame(self.trained_poly.fit_transform(dataframe.iloc[:, :-1]), columns=self.__poly_names()) 74 | transformed[dataframe.columns[-1]] = dataframe.iloc[:, -1].values 75 | 76 | self._surrounding = HyperCube.create_surrounding_cube(transformed, output=self._output) 77 | 78 | cuts = [sorted(best[sum(slices[:i]):sum(slices[:i + 1])]) for i in range(len(slices))] 79 | 80 | intervals = [[(transformed[self.features[i]].min(), cut[0])] + 81 | [(cut[i], cut[i + 1]) for i in range(len(cut) - 1)] + 82 | [(cut[-1], transformed[self.features[i]].max())] for i, cut in enumerate(cuts)] 83 | 84 | hypercubes = [{f: iv for f, iv in zip(self.features, combo)} for combo in itertools.product(*intervals)] 85 | mi_ma = {f: (transformed[f].min(), transformed[f].max()) for f in transformed.columns if f not in self.features} 86 | self._hypercubes = [self._default_cube({feat: h[feat] if feat in self.features else mi_ma[feat] 87 | for feat in transformed.columns[:-1]}) for h in hypercubes] 88 | self._hypercubes = [c for c in self._hypercubes if c.count(transformed) >= 2] 89 | for c in self._hypercubes: 90 | for feature in transformed.columns: 91 | if feature not in self.features: 92 | for direction in ['+', '-']: 93 | 
c.set_infinite(feature, direction) 94 | c.update(transformed) 95 | if self.threshold is not None: 96 | self._hypercubes = self._merge(self._hypercubes, transformed) 97 | return self._create_theory(transformed) 98 | 99 | def make_fair(self, features: Iterable[str]): 100 | self._dimensions_to_ignore.update(features) 101 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/hex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from psyke import get_default_random_seed, Target 9 | from psyke.extraction.hypercubic import Grid, HyperCube, GenericCube, ClassificationCube 10 | from psyke.extraction.hypercubic.gridex import GridEx 11 | 12 | 13 | class HEx(GridEx): 14 | """ 15 | Explanator implementing HEx algorithm. 16 | """ 17 | 18 | class Node: 19 | def __init__(self, cube: GenericCube, parent: HEx.Node = None, threshold: float = None): 20 | self.cube = cube 21 | self.parent = parent 22 | self.children: Iterable[HEx.Node] = [] 23 | self.threshold = threshold 24 | self.gain = True if parent is None else self.check() 25 | 26 | def check(self) -> bool: 27 | other = self.parent 28 | try: 29 | while not other.gain: 30 | other = other.parent 31 | except AttributeError: 32 | return True 33 | if isinstance(other.cube, ClassificationCube): 34 | return other.cube.output != self.cube.output 35 | return other.cube.error - self.cube.error > self.threshold * .6 36 | 37 | def indices(self, dataframe: pd.DataFrame): 38 | return self.cube.filter_indices(dataframe.iloc[:, :-1]) 39 | 40 | def eligible_children(self, dataframe) -> Iterable[HEx.Node]: 41 | return [c for c in self.children if c.cube.count(dataframe) > 0] 42 | 43 | def permanent_children(self, dataframe) -> Iterable[HEx.Node]: 44 | return [c for c in self.eligible_children(dataframe) if c.gain] 45 | 46 | def permanent_indices(self, dataframe): 47 | return np.any([c.cube.filter_indices(dataframe.iloc[:, :-1]) 48 | for c in self.eligible_children(dataframe) if c.gain], axis=0) 49 | 50 | def update(self, dataframe: pd.DataFrame, predictor, recursive=False): 51 | if recursive: 52 | for node in self.children: 53 | node.update(dataframe, predictor, recursive) 54 | cleaned = [(c.cube, c.gain) for c in self.eligible_children(dataframe)] 55 | idx = self.permanent_indices(dataframe) 56 | 57 | if sum(g for _, g in cleaned) > 0 and sum(self.indices(dataframe)) > sum(idx) and self.gain: 58 | self.cube.update(dataframe[self.indices(dataframe) & ~idx], predictor) 59 | return cleaned 60 | 61 | def linearize(self, dataframe, depth=1): 62 | children = [c.linearize(dataframe, depth + 1) for c in self.permanent_children(dataframe)] 63 | return [(cc, dd) for c in children for cc, dd in c if c != []] + \ 64 | [(c, depth) for c in self.permanent_children(dataframe)] 65 | 66 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 67 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 68 | super().__init__(predictor, grid, min_examples, threshold, output, discretization, normalization, seed) 69 | self._default_surrounding_cube = True 70 | 71 | def _gain(self, parent_cube: GenericCube, new_cube: GenericCube) -> float: 72 | if isinstance(parent_cube, ClassificationCube): 73 | return parent_cube.output != new_cube.output 74 | return parent_cube.error - 
new_cube.error > self.threshold * .6 75 | 76 | def _iterate(self, dataframe: pd.DataFrame): 77 | fake = dataframe.copy() 78 | self._surrounding.update(dataframe, self.predictor) 79 | root = HEx.Node(self._surrounding, threshold=self.threshold) 80 | current = [root] 81 | 82 | for iteration in self.grid.iterate(): 83 | next_iteration = [] 84 | for node in current: 85 | if node.cube.diversity < self.threshold: 86 | continue 87 | children, fake = self._cubes_to_split(node.cube, iteration, dataframe, fake, True) 88 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in children] 89 | cleaned = node.update(fake, self.predictor, False) 90 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in self._merge( 91 | [c for c, _ in cleaned], fake)] 92 | next_iteration += [n for n in node.children] 93 | 94 | current = next_iteration.copy() 95 | _ = root.update(fake, self.predictor, True) 96 | self._hypercubes = [] 97 | linearized = root.linearize(fake) 98 | for depth in sorted(np.unique([d for (_, d) in linearized]), reverse=True): 99 | self._hypercubes += self._merge([c.cube for (c, d) in linearized if d == depth], fake) 100 | 101 | if len(self._hypercubes) == 0: 102 | self._hypercubes = [self._surrounding] 103 | elif not min(np.any([c.filter_indices(dataframe.iloc[:, :-1]) for c in self._hypercubes], axis=0)): 104 | self._hypercubes = self._hypercubes + [self._surrounding] 105 | -------------------------------------------------------------------------------- /psyke/clustering/exact/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | from collections import Counter 5 | from typing import Iterable, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.cluster import DBSCAN 10 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 11 | 12 | from psyke.clustering import HyperCubeClustering 13 | from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube 14 | from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon 15 | from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube 16 | from psyke.utils import Target, get_default_random_seed 17 | 18 | 19 | class ExACT(HyperCubeClustering, ABC): 20 | """ 21 | Explanator implementing ExACT algorithm. 
22 | """ 23 | 24 | def __init__(self, depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, 25 | gauss_components: int = 2, discretization=None, normalization=None, 26 | seed: int = get_default_random_seed()): 27 | super().__init__(output, discretization, normalization) 28 | self.depth = depth 29 | self.error_threshold = error_threshold 30 | self.gauss_components = gauss_components 31 | self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor() 32 | self._predictor.n_neighbors = 1 33 | self.seed = seed 34 | 35 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 36 | cubes = [] 37 | for i in range(len(np.unique(gauss_pred))): 38 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 39 | if len(df) == 0: 40 | continue 41 | cubes.append(self._create_cube(df, clusters)) 42 | indices = [self._indices(cube, node.dataframe) for cube in cubes] 43 | return cubes, indices 44 | 45 | @staticmethod 46 | def _indices(cube: ClosedCube, data: pd.DataFrame) -> np.ndarray | None: 47 | indices = cube.filter_indices(data.iloc[:, :-1]) 48 | if len(data.iloc[indices]) * len(data.iloc[~indices]) == 0: 49 | return None 50 | return indices 51 | 52 | def _create_cube(self, dataframe: pd.DataFrame, clusters: int) -> ClosedCube: 53 | data = ExACT._remove_string_label(dataframe) 54 | dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1]) 55 | return HyperCube.create_surrounding_cube( 56 | dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])], 57 | True, self._output, self._protected_features 58 | ) 59 | 60 | def fit(self, dataframe: pd.DataFrame): 61 | np.random.seed(self.seed) 62 | self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1]) 63 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features) 64 | self._hypercubes = self._iterate(Node(dataframe, self._surrounding)) 65 | 66 | def get_hypercubes(self) -> Iterable[HyperCube]: 67 | return list(self._hypercubes) 68 | 69 | def explain(self): 70 | for cube in self._hypercubes: 71 | print(f'Output is {cube.output} if:') 72 | for feature in cube.dimensions: 73 | lower, upper = cube[feature] 74 | print(f' {feature} is in [{lower:.2f}, {upper:.2f}]') 75 | 76 | @staticmethod 77 | def _remove_string_label(dataframe: pd.DataFrame): 78 | return dataframe.replace({dataframe.columns[-1]: {v: k for k, v in dict( 79 | enumerate(dataframe.iloc[:, -1].unique()) 80 | ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe 81 | 82 | def _get_gauss_predictions(self, to_split): 83 | to_split.sort(reverse=True) 84 | (_, depth, _, node) = to_split.pop() 85 | data = ExACT._remove_string_label(node.dataframe) 86 | gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components) 87 | return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params 88 | 89 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 90 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 91 | while len(to_split) > 0: 92 | node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split) 93 | cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 94 | cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices)) 95 | if (idx is not None) and (not node.cube.equal(c))] 96 | if len(cubes) < 1: 97 | continue 98 | _, _, 
_, indices, cube = max(cubes) 99 | 100 | cube.update(node.dataframe[indices], self._predictor) 101 | node.right = Node(node.dataframe[indices], cube) 102 | node.cube.update(node.dataframe[~indices], self._predictor) 103 | node.left = Node(node.dataframe[~indices], node.cube) 104 | 105 | if depth < self.depth and cube.diversity > self.error_threshold: 106 | to_split.append((cube.diversity, depth + 1, np.random.uniform(), node.right)) 107 | return self._node_to_cubes(surrounding) 108 | 109 | def _node_to_cubes(self, root: Node) -> list[ClosedCube]: 110 | if root.right is None: 111 | return [root.cube] 112 | else: 113 | return self._node_to_cubes(root.right) + self._node_to_cubes(root.left) 114 | 115 | def _default_cube(self) -> Union[ClosedCube, ClosedRegressionCube, ClosedClassificationCube]: 116 | if self._output == Target.CONSTANT: 117 | return ClosedCube() 118 | if self._output == Target.REGRESSION: 119 | return ClosedRegressionCube() 120 | return ClosedClassificationCube() 121 | -------------------------------------------------------------------------------- /psyke/extraction/cart/CartPredictor.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import Union, Any 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 6 | from tuprolog.core import clause, Var, Struct 7 | from tuprolog.theory import Theory, mutable_theory 8 | 9 | from psyke.extraction.cart import LeafConstraints, LeafSequence 10 | from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature 11 | from psyke.utils.logic import create_variable_list, create_head, create_term 12 | 13 | 14 | class CartPredictor: 15 | """ 16 | A wrapper for decision and regression trees of sklearn. 
17 | """ 18 | 19 | def __init__(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor] = DecisionTreeClassifier(), 20 | discretization=None, normalization=None): 21 | self._predictor = predictor 22 | self.discretization = discretization 23 | self.normalization = normalization 24 | 25 | def __get_constraints(self, nodes: Iterable[tuple[int, bool]]) -> LeafConstraints: 26 | thresholds = [self._predictor.tree_.threshold[i[0]] for i in nodes] 27 | features = [self._predictor.feature_names_in_[self._predictor.tree_.feature[node[0]]] for node in nodes] 28 | conditions = [node[1] for node in nodes] 29 | if self.normalization is not None: 30 | thresholds = [threshold * self.normalization[feature][1] + self.normalization[feature][0] 31 | for feature, threshold in zip(features, thresholds)] 32 | cond_dict = {} 33 | for feature, condition, threshold in zip(features, conditions, thresholds): 34 | cond = LessThan(threshold) if condition else GreaterThan(threshold) 35 | if feature in cond_dict: 36 | try: 37 | cond_dict[feature][-1] *= cond 38 | except SchemaException: 39 | cond_dict[feature].append(cond) 40 | else: 41 | cond_dict[feature] = [cond] 42 | return cond_dict 43 | 44 | def __get_leaves(self) -> Iterable[int]: 45 | return [i for i, (left_child, right_child) in enumerate(zip( 46 | self._left_children, self._right_children 47 | )) if left_child == -1 and right_child == -1] 48 | 49 | def __get_prediction(self, node: int) -> Any: 50 | if hasattr(self._predictor, 'classes_'): 51 | return self._predictor.classes_[np.argmax(self._predictor.tree_.value[node])] 52 | else: 53 | return self._predictor.tree_.value[node] 54 | 55 | def __path(self, node: int, path=None) -> Iterable[tuple[int, bool]]: 56 | path = [] if path is None else path 57 | if node == 0: 58 | return path 59 | father = list(self._left_children if node in self._left_children else self._right_children).index(node) 60 | return self.__path(father, [(father, node in self._left_children)] + path) 61 | 62 | def __iter__(self) -> LeafSequence: 63 | leaves = self.__get_leaves() 64 | return ((self.__get_constraints(self.__path(i)), self.__get_prediction(i)) for i in leaves) 65 | 66 | def predict(self, data) -> Iterable: 67 | return self._predictor.predict(data) 68 | 69 | @staticmethod 70 | def _simplify_nodes(nodes: list) -> Iterable: 71 | simplified = [nodes.pop(0)] 72 | while len(nodes) > 0: 73 | first_node = nodes[0][0] 74 | for k, conditions in first_node.items(): 75 | for condition in conditions: 76 | if all(k in node[0] and condition in node[0][k] for node in nodes): 77 | [node[0][k].remove(condition) for node in nodes] 78 | simplified.append(nodes.pop(0)) 79 | return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified] 80 | 81 | def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]: 82 | results = [] 83 | for feature_name, cond_list in conditions.items(): 84 | for condition in cond_list: 85 | feature: DiscreteFeature = [d for d in self.discretization if feature_name in d.admissible_values][0] \ 86 | if self.discretization else None 87 | results.append(create_term(variables[feature_name], condition) if feature is None else 88 | create_term(variables[feature.name], 89 | feature.admissible_values[feature_name], 90 | isinstance(condition, GreaterThan))) 91 | return results 92 | 93 | def create_theory(self, data: pd.DataFrame, simplify: bool = True) -> Theory: 94 | new_theory = mutable_theory() 95 | nodes = [node for node in self] 96 | nodes = 
self._simplify_nodes(nodes) if simplify else nodes 97 | for (constraints, prediction) in nodes: 98 | if self.normalization is not None and data.columns[-1] in self.normalization: 99 | m, s = self.normalization[data.columns[-1]] 100 | prediction = prediction * s + m 101 | variables = create_variable_list(self.discretization, data) 102 | new_theory.assertZ( 103 | clause( 104 | create_head(data.columns[-1], list(variables.values()), prediction), 105 | self._create_body(variables, constraints) 106 | ) 107 | ) 108 | return new_theory 109 | 110 | @property 111 | def predictor(self) -> Union[DecisionTreeClassifier, DecisionTreeRegressor]: 112 | return self._predictor 113 | 114 | @property 115 | def n_leaves(self) -> int: 116 | return len(list(self.__get_leaves())) 117 | 118 | @property 119 | def _left_children(self) -> list[int]: 120 | return self._predictor.tree_.children_left 121 | 122 | @property 123 | def _right_children(self) -> list[int]: 124 | return self._predictor.tree_.children_right 125 | 126 | @predictor.setter 127 | def predictor(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor]): 128 | self._predictor = predictor 129 | -------------------------------------------------------------------------------- /psyke/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.real.utils import Rule, IndexedRuleSet 4 | from psyke.schema import DiscreteFeature 5 | from psyke.utils.dataframe import HashableDataFrame 6 | from psyke.utils.logic import create_term, create_head, create_variable_list 7 | from tuprolog.core import Var, Struct, Clause, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable 10 | import pandas as pd 11 | import numpy as np 12 | 13 | 14 | class REAL(PedagogicalExtractor): 15 | """ 16 | Explanator implementing Rule Extraction As Learning (REAL) algorithm, doi:10.1016/B978-1-55860-335-6.50013-1. 17 | The algorithm is sensible to features' order in the provided dataset during extraction. 
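
    Illustrative usage (a sketch; `discrete_dataset` denotes a dataset whose features have been
    one-hot discretised, e.g. via psyke.utils.dataframe.get_discrete_dataset, with the class as
    last column, and `classifier` a predictor trained on it; all names are placeholders):

    >>> from psyke.utils.dataframe import get_discrete_features_supervised
    >>> schema = get_discrete_features_supervised(original_dataset)
    >>> real = REAL(classifier, schema)
    >>> theory = real.extract(discrete_dataset)   # Prolog theory, one clause per extracted rule
    >>> real.n_rules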
18 | """ 19 | 20 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature]): 21 | super().__init__(predictor, discretization) 22 | self._ignore_feature = [] 23 | self._ruleset: IndexedRuleSet = IndexedRuleSet() 24 | 25 | @property 26 | def n_rules(self): 27 | return len(self._ruleset.flatten()) 28 | 29 | def _covers(self, sample: pd.Series, rules: list[Rule]) -> bool: 30 | new_rule = self._rule_from_example(sample) 31 | return any([new_rule in rule for rule in rules]) 32 | 33 | def _body(self, variables: dict[str, Var], rule: Rule) -> list[Struct]: 34 | result = [] 35 | for predicates, truth_value in zip(rule.to_lists(), [True, False]): 36 | for predicate in predicates: 37 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 38 | result.append(create_term(variables[feature.name], feature.admissible_values[predicate], truth_value)) 39 | return result 40 | 41 | def _create_clause(self, dataset: pd.DataFrame, variables: dict[str, Var], key: int, rule: Rule) -> Clause: 42 | return clause(create_head(dataset.columns[-1], list(variables.values()), key), self._body(variables, rule)) 43 | 44 | def _create_new_rule(self, sample: pd.Series) -> Rule: 45 | rule = self._rule_from_example(sample) 46 | return self._generalise(rule, sample) 47 | 48 | def _create_ruleset(self, dataset: pd.DataFrame) -> IndexedRuleSet: 49 | ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(dataset.iloc[:, -1]))) 50 | for _, sample in dataset.iloc[:, :-1].iterrows(): 51 | prediction = list(self.predictor.predict(sample.to_frame().transpose()))[0] 52 | rules = ruleset.get(prediction) 53 | if not self._covers(sample, rules): 54 | rules.append(self._create_new_rule(sample)) 55 | return ruleset.optimize() 56 | 57 | def _create_theory(self, dataset: pd.DataFrame) -> MutableTheory: 58 | theory = mutable_theory() 59 | for key, rule in self._ruleset.flatten(): 60 | variables = create_variable_list(self.discretization) 61 | theory.assertZ(self._create_clause(dataset, variables, key, rule)) 62 | return theory 63 | 64 | def _generalise(self, rule: Rule, sample: pd.Series) -> Rule: 65 | mutable_rule = rule.to_lists() 66 | samples = sample.to_frame().transpose() 67 | for predicate in rule.true_predicates: 68 | samples = self._remove_antecedent(samples.copy(), predicate, mutable_rule) 69 | return Rule(mutable_rule[0], mutable_rule[1]).reduce(self.discretization) 70 | 71 | def _remove_antecedent(self, samples: pd.DataFrame, predicate: str, rule: list[list[str]]) -> (pd.DataFrame, bool): 72 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 73 | output = np.array(self.predictor.predict(samples)) 74 | copies = [samples.copy()] 75 | samples[predicate] = 0 76 | for f in [f for f in feature.admissible_values if f != predicate]: 77 | copy = samples.copy() 78 | copy[f] = 1 79 | if all(output == np.array(self.predictor.predict(copy))): 80 | copies.append(copy) 81 | rule[1].remove(f) 82 | if len(copies) > 1: 83 | rule[0].remove(predicate) 84 | return pd.concat([df for df in copies], ignore_index=True) 85 | 86 | @lru_cache(maxsize=512) 87 | def _get_or_set(self, dataset: HashableDataFrame) -> IndexedRuleSet: 88 | return self._create_ruleset(dataset) 89 | 90 | def _internal_predict(self, sample: pd.Series): 91 | x = [index for index, rule in self._ruleset.flatten() if self._rule_from_example(sample) in rule] 92 | return x[0] if x else None 93 | 94 | def make_fair(self, features: Iterable[str]): 95 | self._ignore_feature = 
[list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 96 | if self.discretization else [features] 97 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 98 | self._get_or_set.cache_clear() 99 | 100 | def _rule_from_example(self, sample: pd.Series) -> Rule: 101 | true_predicates, false_predicates = [], [] 102 | for feature, value in sample.items(): 103 | if feature in self._ignore_feature: 104 | continue 105 | true_predicates.append(str(feature)) if value == 1 else false_predicates.append(str(feature)) 106 | return Rule(true_predicates, false_predicates) 107 | 108 | def _subset(self, samples: pd.DataFrame, predicate: str) -> (pd.DataFrame, bool): 109 | samples_0 = samples.copy() 110 | samples_0[predicate].values[:] = 0 111 | samples_1 = samples.copy() 112 | samples_1[predicate].values[:] = 1 113 | samples_all = samples_0.append(samples_1) 114 | return samples_all, len(set(self.predictor.predict(samples_all))) == 1 115 | 116 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 117 | self._ruleset = self._get_or_set(HashableDataFrame(dataframe)) 118 | return self._create_theory(dataframe) 119 | 120 | def _predict(self, dataframe) -> Iterable: 121 | return np.array([self._internal_predict(data.transpose()) for _, data in dataframe.iterrows()]) 122 | -------------------------------------------------------------------------------- /psyke/genetic/gin/__init__.py: -------------------------------------------------------------------------------- 1 | from statistics import mode 2 | 3 | import numpy as np 4 | from deap import base, creator, tools, algorithms 5 | import random 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, f1_score, accuracy_score 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | from psyke import Target 11 | 12 | 13 | class GIn: 14 | 15 | def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3, 16 | metric='R2', output=Target.REGRESSION, warm=False): 17 | self.X, self.y = train 18 | self.valid = valid 19 | self.output = output 20 | 21 | self.features = features 22 | self.sigmas = sigmas 23 | self.slices = slices 24 | self.min_rules = min_rules 25 | self.poly = PolynomialFeatures(degree=poly, include_bias=False) 26 | 27 | self.alpha = alpha 28 | self.indpb = indpb 29 | self.tournsize = tournsize 30 | self.metric = metric 31 | 32 | self.toolbox = None 33 | self.stats = None 34 | self.hof = None 35 | self.best = None 36 | 37 | self.__setup(warm) 38 | 39 | def _region(self, x, cuts): 40 | indices = [np.searchsorted(np.array(cut), x[f].to_numpy(), side='right') 41 | for cut, f in zip(cuts, self.features)] 42 | 43 | regions = np.zeros(len(x), dtype=int) 44 | multiplier = 1 45 | for idx, n in zip(reversed(indices), reversed([len(cut) + 1 for cut in cuts])): 46 | regions += idx * multiplier 47 | multiplier *= n 48 | 49 | return regions 50 | 51 | def _output_estimation(self, mask, to_pred): 52 | if self.output == Target.REGRESSION: 53 | return LinearRegression().fit(self.poly.fit_transform(self.X)[mask], self.y[mask]).predict( 54 | self.poly.fit_transform(to_pred)) 55 | if self.output == Target.CONSTANT: 56 | return np.mean(self.y[mask]) 57 | if self.output == Target.CLASSIFICATION: 58 | return mode(self.y[mask]) 59 | raise ValueError('Supported outputs are Target.{REGRESSION, CONSTANT, CLASSIFICATION}') 60 | 61 | def _score(self, true, pred): 62 | if 
self.metric == 'R2': 63 | return r2_score(true, pred) 64 | if self.metric == 'MAE': 65 | return -mean_absolute_error(true, pred) 66 | if self.metric == 'MSE': 67 | return -mean_squared_error(true, pred) 68 | if self.metric == 'F1': 69 | return f1_score(true, pred, average='weighted') 70 | if self.metric == 'ACC': 71 | return accuracy_score(true, pred) 72 | raise ValueError('Supported metrics are R2, MAE, MSE, F1, ACC') 73 | 74 | def predict(self, to_pred): 75 | return self.__predict(to_pred=to_pred)[0] 76 | 77 | def _get_cuts(self, individual): 78 | boundaries = np.cumsum([0] + list(self.slices)) 79 | return [sorted(individual[boundaries[i]:boundaries[i + 1]]) for i in range(len(self.slices))] 80 | 81 | def __predict(self, individual=None, to_pred=None): 82 | cuts = self._get_cuts(individual or self.best) 83 | 84 | regions = self._region(to_pred, cuts) 85 | regionsT = self._region(self.X, cuts) 86 | 87 | pred = np.empty(len(to_pred), dtype=f'U{self.y.str.len().max()}') if self.output == Target.CLASSIFICATION \ 88 | else np.zeros(len(to_pred)) 89 | valid_regions = 0 90 | 91 | for r in range(np.prod([s + 1 for s in self.slices])): 92 | mask = regions == r 93 | maskT = regionsT == r 94 | if min(mask.sum(), maskT.sum()) < 3: 95 | if self.output != Target.CLASSIFICATION: 96 | pred[mask] = np.mean(self.y) 97 | continue 98 | pred[mask] = self._output_estimation(maskT, to_pred[mask]) 99 | valid_regions += 1 100 | 101 | return pred, valid_regions 102 | 103 | def _evaluate(self, individual=None): 104 | y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0]) 105 | if valid_regions < self.min_rules: 106 | return -9999, 107 | return self._score(self.y if self.valid is None else self.valid[1], y_pred), 108 | 109 | def __setup(self, warm=False): 110 | if not warm: 111 | creator.create("FitnessMax", base.Fitness, weights=(1.0,)) 112 | creator.create("Individual", list, fitness=creator.FitnessMax) 113 | 114 | self.toolbox = base.Toolbox() 115 | for f in self.features: 116 | self.toolbox.register(f, random.uniform, self.X[f].min(), self.X[f].max()) 117 | 118 | self.toolbox.register("individual", tools.initCycle, creator.Individual, 119 | (sum([[getattr(self.toolbox, f) for i in range(s)] 120 | for f, s in zip(self.features, self.slices)], [])), n=1) 121 | 122 | self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual) 123 | 124 | self.toolbox.register("mate", tools.cxBlend, alpha=self.alpha) 125 | self.toolbox.register("mutate", tools.mutGaussian, indpb=self.indpb, mu=0, 126 | sigma=sum([[sig] * s for sig, s in zip(self.sigmas, self.slices)], [])) 127 | self.toolbox.register("select", tools.selTournament, tournsize=self.tournsize) 128 | self.toolbox.register("evaluate", self._evaluate) 129 | 130 | self.stats = tools.Statistics(lambda ind: ind.fitness.values[0]) 131 | self.stats.register("avg", np.mean) 132 | # self.stats.register("min", np.min) 133 | self.stats.register("max", np.max) 134 | # self.stats.register("std", np.std) 135 | 136 | self.hof = tools.HallOfFame(1) 137 | 138 | def run(self, n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123): 139 | random.seed(seed) 140 | pop = self.toolbox.population(n=n_pop) 141 | result, log = algorithms.eaSimple(pop, self.toolbox, cxpb=cxpb, mutpb=mutpb, ngen=n_gen, 142 | stats=self.stats, halloffame=self.hof, verbose=False) 143 | self.best = tools.selBest(pop, 1)[0] 144 | return self.best, self._evaluate()[0], result, log 145 | 
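

# Illustrative usage sketch (every name and hyper-parameter value below is a placeholder).
# GIn expects a (X, y) training pair where X is a pandas DataFrame and y the corresponding
# target Series, plus one mutation sigma and one number of cut points ("slices") per feature:
#
#   gin = GIn(train=(X_train, y_train), valid=(X_valid, y_valid),
#             features=list(X_train.columns),
#             sigmas=[X_train[f].std() / 10 for f in X_train.columns],
#             slices=[2] * len(X_train.columns),
#             poly=1, metric='R2', output=Target.REGRESSION)
#   _, score, _, _ = gin.run(n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123)
#   y_pred = gin.predict(X_test)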
-------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | The following publications about PSyKE can be cited: 2 | 3 | ### First Design 4 | 5 | > Federico Sabbatini, Giovanni Ciatto, Roberta Calegari, Andrea Omicini. “[Symbolic Knowledge Extraction from Opaque ML Predictors in PSyKE: Platform Design & Experiments](https://doi.org/10.3233/IA-210120)”. Intelligenza Artificiale 16(1):27–48, 2022. doi: 10.3233/IA-210120 6 | 7 | ```bibtex 8 | @article{psyke-ia2022, 9 | title = {Symbolic Knowledge Extraction from Opaque {ML} Predictors in {PSyKE}: Platform Design {\&} Experiments}, 10 | author = {Sabbatini, Federico and Ciatto, Giovanni and Calegari, Roberta and Omicini, Andrea}, 11 | journal = {Intelligenza Artificiale}, 12 | volume = {16}, 13 | number = {1}, 14 | pages = {27--48}, 15 | year = {2022}, 16 | doi = {10.3233/IA-210120}, 17 | url = {https://doi.org/10.3233/IA-210120}, 18 | bdsk-url-1 = {https://doi.org/10.3233/IA-210120} 19 | } 20 | ``` 21 | 22 | > Federico Sabbatini, Giovanni Ciatto, Roberta Calegari, Andrea Omicini. “[On the Design of PSyKE: A Platform for Symbolic Knowledge Extraction](http://ceur-ws.org/Vol-2963/paper14.pdf)”, in: WOA 2021 – 22nd Workshop “From Objects to Agents”, CEUR Workshop Proceedings vol. 2963, pp. 29–48, Sun SITE Central Europe, RWTH Aachen University, 2021. 23 | 24 | ```bibtex 25 | @inproceedings{psyke-woa2021, 26 | title = {On the Design of {PSyKE}: A Platform for Symbolic Knowledge Extraction}, 27 | author = {Sabbatini, Federico and Ciatto, Giovanni and Calegari, Roberta and Omicini, Andrea}, 28 | booktitle = {{WOA} 2021 -- 22nd Workshop ``From Objects to Agents''}, 29 | series = {CEUR Workshop Proceedings}, 30 | volume = {2963}, 31 | pages = {29--48}, 32 | year = {2021}, 33 | editor = {Calegari, Roberta and Ciatto, Giovanni and Denti, Enrico and Omicini, Andrea and Sartor, Giovanni}, 34 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 35 | url = {https://ceur-ws.org/Vol-2963/paper14.pdf} 36 | } 37 | ``` 38 | 39 | ### Visual Fairness Support 40 | 41 | > Federico Sabbatini. “A Fairness-Oriented Visual Extension for the PSyKE Platform”, in: AEQUITAS 2025 – 3rd Workshop on Fairness and Bias in AI. To appear on CEUR Workshop Proceedings, 2025 42 | 43 | ```bibtex 44 | @inproceedings{psyke-2025, 45 | title = {A Fairness-Oriented Visual Extension for the {PSyKE} Platform}, 46 | author = {Sabbatini, Federico}, 47 | booktitle = {Proceedings of the 3rd {AEQUITAS} Workshop on Fairness and Bias in AI, Bologna, Italy, October 26, 2025}, 48 | series = {{CEUR} Workshop Proceedings}, 49 | volume = {}, 50 | pages = {}, 51 | year = {2025}, 52 | editor = {}, 53 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 54 | url = {} 55 | } 56 | ``` 57 | 58 | ### Explainable Clustering Support 59 | 60 | > Federico Sabbatini, Roberta Calegari. “[Unlocking Insights and Trust: The Value of Explainable Clustering Algorithms for Cognitive Agents](https://ceur-ws.org/Vol-3579/paper18.pdf)”, in: WOA 2023 – 24th Workshop “From Objects to Agents”, CEUR Workshop Proceedings vol. 3579, pp. 232–245, Sun SITE Central Europe, RWTH Aachen University, 2023. 
61 | 62 | ```bibtex 63 | @inproceedings{clustering-woa2023, 64 | title = {Unlocking Insights and Trust: {The} Value of Explainable Clustering Algorithms for Cognitive Agents}, 65 | author = {Sabbatini, Federico and Calegari, Roberta}, 66 | booktitle = {Proceedings of the 24th Workshop ``From Objects to Agents'', Roma, Italy, November 6--8, 2023}, 67 | series = {{CEUR} Workshop Proceedings}, 68 | volume = {3579}, 69 | pages = {232--245}, 70 | year = {2023}, 71 | editor = {Falcone, Rino and Castelfranchi, Cristiano and Sapienza, Alessandro and Cantucci, Filippo}, 72 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 73 | url = {https://ceur-ws.org/Vol-3579/paper18.pdf} 74 | } 75 | ``` 76 | 77 | ### PSyKE for Trustworthy AI 78 | 79 | > Roberta Calegari, Federico Sabbatini. “[The PSyKE Technology for Trustworthy Artificial Intelligence](https://link.springer.com/chapter/10.1007/978-3-031-27181-6_1)”, in: AIxIA 2022 – Proceedings of the XXI International Conference of the Italian Association for Artificial Intelligence, Lecture Notes in Computer Science vol. 13796, pp. 3–16, Springer, Cham, Switzerland, 2023. doi: 10.1007/978-3-031-27181-6_1 80 | 81 | ```bibtex 82 | @inproceedings{psyke-trust-aixia2022, 83 | title = {The {PSyKE} Technology for Trustworthy Artificial Intelligence}, 84 | author = {Calegari, Roberta and Sabbatini, Federico}, 85 | booktitle = {{AIxIA} 2022 – Proceedings of the XXI International Conference of the Italian Association for Artificial Intelligence}, 86 | series = {Lecture Notes in Computer Science}, 87 | volume = {13796}, 88 | pages = {3--16}, 89 | year = {2023}, 90 | editor = {Dovier, Agostino and Montanari, Angelo and Orlandini, Andrea}, 91 | publisher = {Springer}, 92 | address = {Cham, Switzerland}, 93 | doi = {10.1007/978-3-031-27181-6_1}, 94 | url = {https://doi.org/10.1007/978-3-031-27181-6_1} 95 | } 96 | ``` 97 | 98 | ### Semantic Web Support 99 | 100 | > Federico Sabbatini, Giovanni Ciatto, Andrea Omicini. “[Semantic Web-Based Interoperability for Intelligent Agents with PSyKE](https://doi.org/10.1007/978-3-031-15565-9_8)”, in: EXTRAAMAS 2022 – 4th International Workshop on Explainable and Transparent AI and Multi-Agent Systems. Lecture Notes in Computer Science vol. 13283, pp. 124–142. Springer, Cham, Switzerland, 2022. 
doi: 10.1007/978-3-031-15565-9_8 101 | 102 | ```bibtex 103 | @incollection{psyke-extraamas2022, 104 | title = {{Semantic Web}-Based Interoperability for Intelligent Agents with {PSyKE}}, 105 | author = {Sabbatini, Federico and Ciatto, Giovanni and Omicini, Andrea}, 106 | booktitle = {EXTRAAMAS 2022 – 4th International Workshop on Explainable and Transparent AI and Multi-Agent Systems}, 107 | series = {Lecture Notes in Computer Science}, 108 | volume = {13283}, 109 | pages = {124--142}, 110 | year = {2022}, 111 | editor = {Calvaresi, Davide and Najjar, Amro and Winikoff, Michael and Fr{\"a}mling, Kary}, 112 | publisher = {Springer}, 113 | address = {Cham, Switzerland}, 114 | doi = {10.1007/978-3-031-15565-9_8}, 115 | url = {https://link.springer.com/10.1007/978-3-031-15565-9_8} 116 | } 117 | ``` -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable, Union 4 | import numpy as np 5 | import onnxruntime 6 | import pandas as pd 7 | from keras import Input, Model 8 | from keras.src.layers import Dense 9 | from onnxconverter_common import FloatTensorType, Int64TensorType, StringTensorType, DataType 10 | from sklearn.ensemble import RandomForestRegressor 11 | from sklearn.neighbors import KNeighborsClassifier 12 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 13 | from tensorflow.random import set_seed 14 | from psyke.schema import DiscreteFeature, Value 15 | from psyke.utils import get_default_random_seed 16 | from sklearn.datasets import load_iris 17 | from psyke import Extractor 18 | from psyke.utils.dataframe import get_discrete_features_supervised 19 | from test.resources.datasets import open_dataset 20 | from test.resources.predictors import PATH, get_predictor_path 21 | 22 | REQUIRED_PREDICTORS: str = PATH / '.required.csv' 23 | LE = '=<' 24 | GE = '>=' 25 | L = '<' 26 | G = '>' 27 | 28 | 29 | def get_extractor(extractor_type: str, parameters: dict): 30 | if extractor_type.lower() == 'cart': 31 | return Extractor.cart(**parameters) 32 | elif extractor_type.lower() == 'iter': 33 | return Extractor.iter(**parameters) 34 | elif extractor_type.lower() == 'real': 35 | return Extractor.real(**parameters) 36 | elif extractor_type.lower() == 'trepan': 37 | return Extractor.trepan(**parameters) 38 | elif extractor_type.lower() == 'gridex': 39 | return Extractor.gridex(**parameters) 40 | else: 41 | raise NotImplementedError(extractor_type + ' not implemented yet.') 42 | 43 | 44 | def get_model(model_type: str, parameters: dict): 45 | if model_type.lower() == 'rfr': 46 | return RandomForestRegressor(**parameters, random_state=np.random.seed(get_default_random_seed())), False 47 | elif model_type.lower() == 'knnc': 48 | return KNeighborsClassifier(**parameters), False 49 | elif model_type.lower() == 'dtc': 50 | return DecisionTreeClassifier(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 51 | elif model_type.lower() == 'dtr': 52 | return DecisionTreeRegressor(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 53 | elif model_type.lower() == 'nn': 54 | return get_simple_neural_network(**parameters, random_state=np.random.seed(get_default_random_seed())), False 55 | else: 56 | return Predictor.load_from_onnx(str(get_predictor_path(model_type))), True 57 | 58 | 59 | def get_simple_neural_network(input: int = 4, output: int = 3, 
layers: int = 3, neurons: int = 32, 60 | random_state: int = np.random.seed(get_default_random_seed())) -> Model: 61 | set_seed(random_state) 62 | input_layer = Input(input) 63 | x = input_layer 64 | for _ in range(layers-1): 65 | x = Dense(neurons, activation='relu')(x) 66 | x = Dense(output, activation='softmax')(x) 67 | model = Model(input_layer, x) 68 | model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) 69 | return model 70 | 71 | 72 | def get_dataset(name: str): 73 | if name.lower() == 'house': 74 | x = pd.read_csv(open_dataset('houseX'), index_col=[0]) 75 | y = pd.read_csv(open_dataset('housey'), index_col=[0]).MedHouseVal 76 | normalized_x = _normalize_data(x) 77 | normalized_y = _normalize_data(y) 78 | return normalized_x.join(normalized_y) 79 | elif name.lower() == 'iris': 80 | x, y = load_iris(return_X_y=True, as_frame=True) 81 | y = pd.DataFrame(y).replace({"target": {0: 'setosa', 1: 'versicolor', 2: 'virginica'}}) 82 | result = x.join(y) 83 | result.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'iris'] 84 | return result 85 | else: 86 | raise Exception('unknown dataset name.') 87 | 88 | 89 | def _normalize_data(x: pd.DataFrame) -> pd.DataFrame: 90 | return (x - x.min()) / (x.max() - x.min()) 91 | 92 | 93 | def get_schema(dataset: pd.DataFrame) -> Union[Iterable[DiscreteFeature], None]: 94 | return get_discrete_features_supervised(dataset) 95 | # return SCHEMAS[filename] if filename in SCHEMAS.keys() else None 96 | 97 | 98 | def _get_admissible_values(prepositions: Iterable[str]) -> dict[str, Value]: 99 | raise NotImplementedError('Automatic schema reading not implemented yet.') 100 | 101 | 102 | class Predictor: 103 | 104 | def __init__(self, model, from_file_onnx=False): 105 | self._model = model 106 | self._from_file_onnx = from_file_onnx 107 | 108 | @staticmethod 109 | def load_from_onnx(file: str) -> Predictor: 110 | return Predictor(onnxruntime.InferenceSession(file), True) 111 | 112 | #def save_to_onnx(self, file, initial_types: list[tuple[str, DataType]]): 113 | # file = str(file) + '.onnx' 114 | # if not self._from_file_onnx: 115 | # if os.path.exists(file): 116 | # os.remove(file) 117 | # if isinstance(self._model, Model): 118 | # save(self._model, "tmp_model") 119 | # os.system("python -m tf2onnx.convert --saved-model tmp_model --output " + file) 120 | # else: 121 | # onnx_predictor = convert_sklearn(self._model, initial_types=initial_types) 122 | # with open(file, 'wb') as f: 123 | # f.write(onnx_predictor.SerializeToString()) 124 | 125 | def predict(self, dataset: pd.DataFrame | np.ndarray) -> Iterable: 126 | array = dataset.to_numpy() if isinstance(dataset, pd.DataFrame) else dataset 127 | if self._from_file_onnx: 128 | input_name = self._model.get_inputs()[0].name 129 | label_name = self._model.get_outputs()[0].name 130 | if array.dtype == 'float64': 131 | tensor_type = np.float32 132 | elif array.dtype == 'int64' or array.dtype == 'int32': 133 | tensor_type = np.int64 134 | else: 135 | tensor_type = np.str 136 | pred_onx = self._model.run([label_name], {input_name: array.astype(tensor_type)})[0] 137 | return [prediction for plist in pred_onx for prediction in plist] if isinstance(pred_onx[0], list) \ 138 | else [prediction for prediction in pred_onx] 139 | else: 140 | return self._model.predict(dataset) 141 | 142 | # TODO: to be improved, make it more flexible 143 | @staticmethod 144 | def get_initial_types(dataset: pd.DataFrame | np.ndarray) -> list[tuple[str, DataType]]: 145 | array = dataset.to_numpy() if 
isinstance(dataset, pd.DataFrame) else dataset 146 | name = '' 147 | for column in dataset.columns: 148 | name += column + ', ' 149 | name = name[:-2] 150 | shape = [None, array.shape[1]] 151 | if array.dtype == 'float64': 152 | types = FloatTensorType(shape) 153 | elif array.dtype == 'int64': 154 | types = Int64TensorType(shape) 155 | else: 156 | types = StringTensorType(shape) 157 | return [(name, types)] 158 | -------------------------------------------------------------------------------- /psyke/tuning/pedro/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from enum import Enum 4 | 5 | from sklearn.metrics import accuracy_score 6 | 7 | from psyke import Extractor, Target 8 | from psyke.extraction.hypercubic import Grid, FeatureRanker 9 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 10 | from psyke.tuning import Objective, IterativeOptimizer, SKEOptimizer 11 | 12 | 13 | class PEDRO(SKEOptimizer, IterativeOptimizer): 14 | class Algorithm(Enum): 15 | GRIDEX = 1, 16 | GRIDREX = 2, 17 | HEX = 3 18 | 19 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 20 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 3, 21 | patience: int = 3, algorithm: Algorithm = Algorithm.GRIDREX, objective: Objective = Objective.MODEL, 22 | output: Target = Target.CONSTANT, normalization=None, discretization=None): 23 | SKEOptimizer.__init__(self, predictor, dataframe, max_error_increase, min_rule_decrease, 24 | readability_tradeoff, patience, objective, output, normalization, discretization) 25 | IterativeOptimizer.__init__(self, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 26 | max_depth, patience, output, normalization, discretization) 27 | self.algorithm = Extractor.gridrex if algorithm == PEDRO.Algorithm.GRIDREX else \ 28 | Extractor.gridex if algorithm == PEDRO.Algorithm.GRIDEX else Extractor.hex 29 | self.algorithm_name = "GridREx" if algorithm == PEDRO.Algorithm.GRIDREX else \ 30 | "GridEx" if algorithm == PEDRO.Algorithm.GRIDEX else "HEx" 31 | self.ranked = FeatureRanker(dataframe.columns[:-1]).fit(predictor, dataframe.iloc[:, :-1]).rankings() 32 | predictions = self.predictor.predict(dataframe.iloc[:, :-1]).flatten() 33 | expected = self.dataframe.iloc[:, -1].values 34 | self.error = 1 - accuracy_score(predictions, expected) if output == Target.CLASSIFICATION else \ 35 | abs(predictions - expected).mean() 36 | 37 | def _search_depth(self, strategy, critical, max_partitions): 38 | params, best = [], None 39 | 40 | for iterations in range(self.max_depth): 41 | current_params = self.__search_threshold(Grid(iterations + 1, strategy), critical, max_partitions) 42 | current_best = self._best(current_params)[1] 43 | print() 44 | best, to_break = self._check_iteration_improvement(best, current_best) 45 | params += current_params 46 | 47 | if len(params) > 1 and to_break: 48 | break 49 | return params 50 | 51 | def __search_threshold(self, grid, critical, max_partitions): 52 | step = self.error / 2.0 53 | threshold = self.error * 0.5 54 | params = [] 55 | patience = self.patience 56 | while patience > 0: 57 | print("{}. {}. Threshold = {:.2f}. 
".format(self.algorithm_name, grid, threshold), end="") 58 | param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization) 59 | if self.algorithm != Extractor.gridrex: 60 | param_dict['output'] = self.output 61 | extractor = self.algorithm(self.predictor, grid, **param_dict) 62 | _ = extractor.extract(self.dataframe) 63 | error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \ 64 | else extractor.mae 65 | error, n = (error_function(self.dataframe, self.predictor) if self.objective == Objective.MODEL else 66 | error_function(self.dataframe)), extractor.n_rules 67 | print("MAE = {:.2f}, {} rules".format(error, n)) 68 | 69 | if len(params) == 0: 70 | params.append((error, n, threshold, grid)) 71 | threshold += step 72 | continue 73 | 74 | if n > max_partitions: 75 | break 76 | 77 | if n == 1: 78 | params.append((error, n, threshold, grid)) 79 | break 80 | 81 | if error > params[0][0] * self.max_error_increase: 82 | break 83 | 84 | improvement = (params[-1][0] / error) + (1 - n / params[-1][1]) 85 | 86 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 87 | patience -= 1 88 | step = max(step, abs(error - threshold) / max(patience, 1)) 89 | elif not critical: 90 | patience = self.patience 91 | if error != params[-1][0] or n != params[-1][1]: 92 | params.append((error, n, threshold, grid)) 93 | threshold += step 94 | return params 95 | 96 | def __contains(self, strategies, strategy): 97 | for s in strategies: 98 | if strategy.equals(s, self.dataframe.columns[:-1]): 99 | return True 100 | return False 101 | 102 | def search(self): 103 | max_partitions = 200 104 | base_partitions = FixedStrategy(2).partition_number(self.dataframe.columns[:-1]) * 3 105 | if base_partitions <= max_partitions: 106 | strategies = [FixedStrategy(2)] 107 | if FixedStrategy(3).partition_number(self.dataframe.columns[:-1]) <= max_partitions: 108 | strategies.append(FixedStrategy(3)) 109 | else: 110 | strategies = [] 111 | base_partitions = max_partitions 112 | 113 | for n in [2, 3, 5, 10]: 114 | for th in [0.99, 0.75, 0.67, 0.5, 0.3]: 115 | strategy = AdaptiveStrategy(self.ranked, [(th, n)]) 116 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 117 | not self.__contains(strategies, strategy): 118 | strategies.append(strategy) 119 | 120 | for (a, b) in [(0.33, 0.67), (0.25, 0.75), (0.1, 0.9)]: 121 | strategy = AdaptiveStrategy(self.ranked, [(a, 2), (b, 3)]) 122 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 123 | not self.__contains(strategies, strategy): 124 | strategies.append(strategy) 125 | 126 | avg = 0. 
127 | for strategy in strategies: 128 | avg += strategy.partition_number(self.dataframe.columns[:-1]) 129 | avg /= len(strategies) 130 | 131 | params = [] 132 | for strategy in strategies: 133 | params += self._search_depth(strategy, 134 | strategy.partition_number(self.dataframe.columns[:-1]) > avg, 135 | base_partitions) 136 | self.params = params 137 | 138 | def _print_params(self, name, params): 139 | print("**********************") 140 | print(f"Best {name}") 141 | print("**********************") 142 | print(f"Error = {params[0]:.2f}, {params[1]} rules") 143 | print(f"Threshold = {params[2]:.2f}") 144 | print(f"Iterations = {params[3].iterations}") 145 | print(f"Strategy = {params[3].strategy}") 146 | -------------------------------------------------------------------------------- /psyke/utils/dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | from hashlib import sha256 5 | from typing import Iterable, List 6 | import pandas as pd 7 | from pandas.core.util.hashing import hash_pandas_object 8 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_integer_dtype 9 | from sklearn.preprocessing import StandardScaler 10 | from sympy.core.containers import OrderedSet 11 | 12 | from psyke import DiscreteFeature 13 | from psyke.schema import LessThan, GreaterThan, Between, Value, Constant 14 | from psyke.utils import TypeNotAllowedException, Range 15 | 16 | 17 | def split_features(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 18 | result = [] 19 | features = {'V' + str(index + 1): column for index, column in enumerate(dataframe.columns)} 20 | for feature, column in features.items(): 21 | values = set(dataframe[column]) 22 | result.append(DiscreteFeature(feature, {feature + '_' + str(i): v for i, v in enumerate(values)})) 23 | return result 24 | 25 | 26 | def get_discrete_features_supervised(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 27 | result = OrderedSet() 28 | for feature in dataframe.columns[:-1]: 29 | result.add(DiscreteFeature(feature, create_set(feature, dataframe))) 30 | return result 31 | 32 | 33 | def create_set(feature: str, dataframe: pd.DataFrame) -> dict[str, Value]: 34 | if is_string_dtype(dataframe[feature]) or is_integer_dtype(dataframe[feature]): 35 | values = dataframe[feature].unique() 36 | elif is_numeric_dtype(dataframe[feature]): 37 | values = create_ranges(feature, dataframe) 38 | else: 39 | raise TypeNotAllowedException(dataframe[feature].dtype) 40 | return {"{}_{}".format(feature, i): create_original_value(v) for (i, v) in enumerate(values)} 41 | 42 | 43 | def create_original_value(value: Range | str | int) -> Value: 44 | if isinstance(value, Range): 45 | if value.lower == float('-inf'): 46 | return LessThan(value.upper) 47 | if value.upper == float('inf'): 48 | return GreaterThan(value.lower) 49 | return Between(value.lower, value.upper) 50 | return Constant(value) 51 | 52 | 53 | def create_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 54 | ranges = init_ranges(feature, dataframe) 55 | expand_ranges(ranges) 56 | ranges[0].left_infinite() 57 | ranges[-1].right_infinite() 58 | return ranges 59 | 60 | 61 | def expand_ranges(ranges: Iterable[Range]): 62 | for r1, r2 in zip(ranges[0:-1], ranges[1:]): 63 | while r1.upper < r2.lower: 64 | r1.expand_right() 65 | r2.expand_left() 66 | mean = ((r1.upper - r1.std + r2.lower + r2.std) / 2) 67 | r1.upper = mean 68 | r2.lower = mean 69 | 70 | 71 | def 
init_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 72 | desc = [dataframe[dataframe.iloc[:, -1] == v].describe()[feature] for v in dataframe.iloc[:, -1].unique()] 73 | desc = [(d['mean'], d['std']) for d in desc] 74 | desc.sort() 75 | return [Range(d[0], d[1]) for d in desc] 76 | 77 | 78 | def get_discrete_features_equal_frequency( 79 | dataframe: pd.DataFrame, 80 | bins: int = None, 81 | output: bool = True, 82 | bin_names: List[str] = [] 83 | ) -> Iterable[DiscreteFeature]: 84 | features = dataframe.columns[:-1] if output else dataframe.columns 85 | result = set() 86 | if bins is None: 87 | if len(bin_names) > 0: 88 | bins = len(bin_names) 89 | else: 90 | raise ValueError("No bins nor bin_names have been provided") 91 | elif bins > 0: 92 | if len(bin_names) == 0: 93 | bin_names = range(0, bins) 94 | elif len(bin_names) == bins: 95 | pass 96 | else: 97 | raise ValueError("Mismatch among the provided amount of bins and the bin_names") 98 | else: 99 | raise ValueError("Negative amount of bins makes no sense") 100 | for feature in features: 101 | values = sorted(dataframe[feature]) 102 | intervals = [values[i * math.ceil(len(values) / bins)] for i in range(1, bins)] 103 | starting_interval: list[Value] = [LessThan(intervals[0])] 104 | ending_interval: list[Value] = [GreaterThan(intervals[-1])] 105 | middle_intervals: list[Value] = [Between(intervals[i], intervals[i + 1]) for i in range(0, len(intervals) - 1)] 106 | new_intervals = starting_interval + middle_intervals + ending_interval 107 | new_feature_names = [feature + '_' + str(i) for i in range(0, bins)] 108 | new_features = {new_feature_names[i]: new_intervals[i] for i in range(0, bins)} 109 | result.add(DiscreteFeature(feature, new_features)) 110 | return result 111 | 112 | 113 | def get_discrete_dataset(dataset: pd.DataFrame, discrete_features: Iterable[DiscreteFeature], 114 | sort: bool = True) -> pd.DataFrame: 115 | """ 116 | Create a new dataset mapping the old features into the new discrete features. 117 | Note: some algorithms require the same SORTED feature to be replicable due to rule optimization and other stuffs. 118 | Therefore the new features are alphabetically sorted. 119 | This is not strictly necessary because internally those algorithms perform the sorting themself. 120 | However it is a good idea to have this same function returning the same result w.r.t. the inputs. 
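
    Illustrative call (a sketch; `df` is a placeholder for a DataFrame whose last column is the target):
        discrete = get_discrete_dataset(df.iloc[:, :-1], get_discrete_features_supervised(df))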
121 | 122 | :param dataset: the original dataset 123 | :param discrete_features: mapping for the features 124 | :param sort: alphabetically sort new features 125 | :return: the new discrete dataset 126 | """ 127 | columns_name = [key for feature in discrete_features for key, _ in feature.admissible_values.items()] 128 | if sort: 129 | columns_name = sorted(columns_name) 130 | new_dataset = pd.DataFrame(columns=columns_name) 131 | for feature in discrete_features: 132 | for index, value in enumerate(dataset[feature.name]): 133 | for key, admissible_value in feature.admissible_values.items(): 134 | new_dataset.loc[index, key] = int(admissible_value.is_in(value)) 135 | 136 | for feature in discrete_features: 137 | for new_feature in feature.admissible_values.keys(): 138 | new_dataset[new_feature] = new_dataset[new_feature].astype(str).astype(int) 139 | 140 | return new_dataset 141 | 142 | 143 | def get_scaled_dataset(dataset: pd.DataFrame) -> tuple[pd.DataFrame, dict[str, tuple[float, float]]]: 144 | scaler = StandardScaler() 145 | scaler.fit(dataset) 146 | normalization = {key: (m, s) for key, m, s in zip(dataset.columns, scaler.mean_, scaler.scale_)} 147 | return pd.DataFrame(scaler.transform(dataset), columns=dataset.columns, index=dataset.index), normalization 148 | 149 | 150 | def scale_dataset(dataset: pd.DataFrame, normalization: dict[str, tuple[float, float]]) -> pd.DataFrame: 151 | new_data = pd.DataFrame() 152 | for column in dataset.columns: 153 | m, s = normalization[column] 154 | new_data[column] = (dataset[column] - m) / s 155 | return new_data 156 | 157 | 158 | class HashableDataFrame(pd.DataFrame): 159 | def __init__(self, obj): 160 | super().__init__(obj) 161 | 162 | def __hash__(self): 163 | hash_value = sha256(hash_pandas_object(self, index=True).values) 164 | hash_value = hash(hash_value.hexdigest()) 165 | return hash_value 166 | 167 | def __eq__(self, other): 168 | return self.equals(other) 169 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.trepan.utils import Node, Split, SplitLogic 4 | from psyke import DiscreteFeature 5 | from psyke.utils.logic import create_term, create_variable_list, create_head 6 | from psyke.utils.sorted import SortedList 7 | from tuprolog.core import Var, Struct, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable, Union, Any 10 | import pandas as pd 11 | 12 | 13 | class Trepan(PedagogicalExtractor): 14 | 15 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature], min_examples: int = 0, max_depth: int = 3, 16 | split_logic: SplitLogic = SplitLogic.DEFAULT): 17 | super().__init__(predictor, discretization) 18 | self._ignore_feature = [] 19 | self.min_examples = min_examples 20 | self.max_depth = max_depth 21 | self.split_logic = split_logic 22 | self._root: Node 23 | 24 | def make_fair(self, features: Iterable[str]): 25 | self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 26 | if self.discretization else [features] 27 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 28 | 29 | @property 30 | def n_rules(self): 31 | return sum(1 for _ in self._root) 32 | 33 | def _best_split(self, node: Node, names: Iterable[str]) -> 
Union[tuple[Node, Node], None]: 34 | if node.samples.shape[0] < self.min_examples: 35 | raise NotImplementedError() 36 | if node.n_classes == 1: 37 | return None 38 | splits = self._create_splits(node, names) 39 | return None if len(splits) == 0 or splits[0].children[0].depth > self.max_depth else splits[0].children 40 | 41 | def _compact(self): 42 | nodes = [self._root] 43 | while len(nodes) > 0: 44 | node = nodes.pop() 45 | for item in self._nodes_to_remove(node, nodes): 46 | node.children.remove(item) 47 | node.children += item.children 48 | 49 | def _create_body(self, variables: dict[str, Var], node: Node) -> Iterable[Struct]: 50 | result = [] 51 | for constraint, value in node.constraints: 52 | feature: DiscreteFeature = [d for d in self.discretization if constraint in d.admissible_values][0] 53 | result.append(create_term(variables[feature.name], feature.admissible_values[constraint], value == 1.0)) 54 | return result 55 | 56 | @staticmethod 57 | def _create_samples(node: Node, column: str, value: float) -> pd.DataFrame: 58 | return node.samples.loc[node.samples[column] == value] 59 | 60 | @staticmethod 61 | def _create_split(node: Node, column: str) -> Union[Split, None]: 62 | true_examples = Trepan._create_samples(node, column, 1.0) 63 | false_examples = Trepan._create_samples(node, column, 0.0) 64 | true_constraints = list(node.constraints) + [(column, 1.0)] 65 | false_constraints = list(node.constraints) + [(column, 0.0)] 66 | true_node = Node(true_examples, node.n_examples, true_constraints, depth=node.depth + 1) \ 67 | if true_examples.shape[0] > 0 else None 68 | false_node = Node(false_examples, node.n_examples, false_constraints, depth=node.depth + 1) \ 69 | if false_examples.shape[0] > 0 else None 70 | return None if true_node is None or false_node is None else Split(node, (true_node, false_node)) 71 | 72 | def _create_splits(self, node: Node, names: Iterable[str]) -> SortedList[Split]: 73 | splits, constraints = Trepan._init_splits(node) 74 | for column in [column for column in names if column not in list(constraints) + self._ignore_feature]: 75 | split = Trepan._create_split(node, column) 76 | if split is not None: 77 | splits.add(split) 78 | return splits 79 | 80 | def _create_theory(self, name: str) -> MutableTheory: 81 | theory = mutable_theory() 82 | for node in self._root: 83 | variables = create_variable_list(self.discretization) 84 | theory.assertZ( 85 | clause( 86 | create_head(name, list(variables.values()), str(node.dominant)), 87 | self._create_body(variables, node) 88 | ) 89 | ) 90 | return theory 91 | 92 | def _init(self, dateset: pd.DataFrame) -> SortedList[Node]: 93 | self._root = Node(dateset, dateset.shape[0]) 94 | queue: SortedList[Node] = SortedList(lambda x, y: int(x.priority - y.priority)) 95 | queue.add(self._root) 96 | return queue 97 | 98 | @staticmethod 99 | def _init_splits(node: Node) -> tuple[SortedList[Split], Iterable[str]]: 100 | return SortedList(lambda x, y: int(x.priority - y.priority)),\ 101 | set(constraint[0] for constraint in node.constraints) 102 | 103 | @staticmethod 104 | def _nodes_to_remove(node: Node, nodes: list[Node]) -> list[Node]: 105 | to_remove = [] 106 | for child in node.children: 107 | if node.dominant == child.dominant and len(child.children) == 1: 108 | to_remove.append(child) 109 | nodes.append(node) 110 | else: 111 | nodes.append(child) 112 | return to_remove 113 | 114 | @staticmethod 115 | def _internal_predict(x: pd.Series, node: Node, categories: Iterable) -> Any: 116 | for child in node.children: 117 | skip = 
False 118 | for constraint, value in child.constraints: 119 | if x[constraint] != value: 120 | skip = True 121 | continue 122 | if not skip: 123 | return Trepan._internal_predict(x, child, categories) 124 | return node.dominant 125 | 126 | def _optimize(self) -> None: 127 | n, nodes = 0, [self._root] 128 | while len(nodes) > 0: 129 | n += Trepan._remove_nodes(nodes) 130 | self._compact() if n == 0 else self._optimize() 131 | 132 | @staticmethod 133 | def _remove_nodes(nodes: list[Node]) -> int: 134 | node = nodes.pop() 135 | to_remove = [child for child in node.children if len(child.children) == 0 and node.dominant == child.dominant] 136 | for child in to_remove: 137 | node.children.remove(child) 138 | for child in node.children: 139 | if len(child.children) > 0: 140 | nodes.append(child) 141 | return len(to_remove) 142 | 143 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 144 | queue = self._init(dataframe) 145 | while len(queue) > 0: 146 | node = queue.pop() 147 | if self.split_logic == SplitLogic.DEFAULT: 148 | best: Union[tuple[Node, Node], None] = self._best_split(node, dataframe.columns[:-1]) 149 | if best is None: 150 | continue 151 | else: 152 | raise Exception('Illegal split logic') 153 | queue.add_all(best) 154 | node.children += list(best) 155 | self._optimize() 156 | return self._create_theory(dataframe.columns[-1]) 157 | 158 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 159 | return np.array( 160 | [Trepan._internal_predict(sample, self._root, dataframe.columns[-1]) for _, sample in dataframe.iterrows()] 161 | ) 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PSyKE 2 | 3 | ![PSyKE Logo](.img/logo-wide.png) 4 | 5 | Quick links: 6 | * [Home Page](https://apice.unibo.it/xwiki/bin/view/PSyKE/) 7 | * [GitHub Repository](https://github.com/psykei/psyke-python) 8 | * [PyPi Repository](https://pypi.org/project/psyke/) 9 | * [Issues](https://github.com/psykei/psyke-python/issues) 10 | 11 | ## Latest Releases 12 | 13 | * PSyKE 1.0: Compatibility with Python 3.11.x 14 | * PSyKE 0.10: New genetic algorithms for knowledge extraction 15 | * PSyKE 0.9: Fairness mitigation support for knowedge extractors 16 | * PSyKE 0.8: New features: local explainability and counterfactual support 17 | * PSyKE 0.7: New SKE algorithms implemented 18 | 19 | ## Intro 20 | 21 | [PSyKE](https://apice.unibo.it/xwiki/bin/view/PSyKE/) (Platform for Symbolic Knowledge Extraction) 22 | is intended as a library for extracting symbolic knowledge (in the form of logic rule lists) out of sub-symbolic predictors. 23 | 24 | More precisely, PSyKE offers a general purpose API for knowledge extraction, and a number of different algorithms implementing it, 25 | supporting both classification and regression problems. 26 | The extracted knowledge consists of a Prolog theory (i.e., a list of Horn clauses) or an OWL ontology containing SWRL rules. 27 | 28 | PSyKE relies on [2ppy](https://github.com/tuProlog/2ppy) (tuProlog in Python) for logic support, which in turn is based on the [2p-Kt](https://github.com/tuProlog/2p-kt) logic ecosystem. 
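
For instance, extracting rules from a trained regressor boils down to a few calls
(a minimal sketch: the dataset, the hyper-parameter values and the choice of GridEx are purely
illustrative; analogous factory methods exist for the other algorithms listed below):

```python
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from psyke import Extractor
from psyke.extraction.hypercubic import Grid
from psyke.extraction.hypercubic.strategy import FixedStrategy

# `dataset` is any pandas DataFrame whose last column is the (numeric) target
train, test = train_test_split(dataset, test_size=0.5)
predictor = KNeighborsRegressor().fit(train.iloc[:, :-1], train.iloc[:, -1])

extractor = Extractor.gridex(predictor, Grid(1, FixedStrategy(2)),
                             min_examples=25, threshold=0.1)
theory = extractor.extract(train)                      # logic theory (list of Horn clauses)
predictions = extractor.predict(test.iloc[:, :-1])     # predictions from the extracted rules
print(extractor.mae(test))                             # mean absolute error of the rules on unseen data
```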
29 | 30 | ### Class diagram overview: 31 | 32 | ![PSyKE class diagram](http://www.plantuml.com/plantuml/svg/PLBBRkem4DtdAqQixeLcqsN40aHfLQch2dM341gS0IpoY3oJYfJctnl7RkgcKZRdCUFZ4ozOq4YTPr65we8dWlkgQcuHmEPCfMbW6iDaEe5LXZLJr4QHof3PgxVMGoTtS5XJSNCXkwVxlhdUguzQeUYoi28u3bxNovS0RWnLM7H46mNZXaw6c4UZpq8cW4z6ftGTZoeq4WwjB6x7BbPdoZ7qFMXMXeGU2QKsv2I06HmTiIymfmHOpA1WccjcVSXe_uvPJPn0gfLiEyyTl5bcrtk7qzTNCQYaDBxhyQ6_BFFFEExJ_sLzXoFMLpdcVMrZrhVNvS83zygFmrv-1fMXL5lOezH5rH_z7qqWqonRbn-72-nwAxaz_r8KP9B_YNz3uTP0jFcmAt6xB9gT3UJSC8_Z87G2PIrLBL0UemKLQPrdNm00) 33 | 34 | 37 | 38 | PSyKE is designed around the notion of _extractor_. 39 | More precisely, an `Extractor` is any object capable of extracting a logic `Theory` out of a trained sub-symbolic regressor or classifier. 40 | Accordingly, an `Extractor` is composed of 41 | _(i)_ a trained predictor (i.e., black-box used as an oracle) and 42 | _(ii)_ a set of feature descriptors, and it provides two methods: 43 | * `extract`: returns a logic theory given a dataset; 44 | * `predict`: predicts a value using the extracted rules (instead of the original predictor). 45 | 46 | Currently, the supported extraction algorithms are: 47 | * [CART](https://doi.org/10.1201/9781315139470), 48 | straightforward extracts rules from both classification and regression decision trees; 49 | * Classification: 50 | * [REAL](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1) (Rule Extraction As Learning), 51 | generates and generalizes rules strarting from dataset samples; 52 | * [Trepan](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1), 53 | generates rules by inducing a decision tree and possibly exploiting m-of-n expressions; 54 | * Regression: 55 | * [ITER](http://dx.doi.org/10.1007/11823728_26), 56 | builds and iteratively expands hypercubes in the input space. 57 | Each cube holds a constant value, that is the estimated output for the samples inside the cube; 58 | * [GridEx](http://dx.doi.org/10.1007/978-3-030-82017-6_2), 59 | extension of the ITER algorithm that produces shorter rule lists retaining higher fidelity w.r.t. the predictor. 60 | * GridREx, 61 | extension of GridEx where the output of each hypercube is a linear combination of the input variables and not a constant value. 62 | 63 | Users may exploit the PEDRO algorithm, included in PSyKE, to tune the optimal values for GridEx and GridREx hyper-parameters. 64 | 65 | We are working on PSyKE to extend its features to encompass explainable clustering tasks, as well as to make more general-purpose the supported extraction algorithms (e.g., by adding classification support to GridEx and GridREx). 66 | 67 | ## Users 68 | 69 | ### End users 70 | 71 | PSyKE is deployed as a library on Pypi. It can be installed as Python package by running: 72 | ```bash 73 | pip install psyke 74 | ``` 75 | 76 | #### Requirements 77 | 78 | Please refer to the [requirements file](https://github.com/psykei/psyke-python/blob/master/requirements.txt) 79 | 80 | ##### Test requirements 81 | * `skl2onnx` 82 | * `onnxruntime` 83 | * `parameterized` 84 | 85 | Once installed, it is possible to create an extractor from a predictor 86 | (e.g. Neural Network, Support Vector Machine, K-Nearest Neighbours, Random Forest, etc.) 87 | and from the data set used to train the predictor. 88 | 89 | > **Note:** the predictor must expose a method named `predict` to be properly used as an oracle. 90 | 91 | #### End users 92 | 93 | A brief example is presented in `demo.py` script in the `demo/` folder. 
94 | Using `sklearn`'s Iris data set we train a K-Nearest Neighbours classifier to predict the correct output class.
95 | Before training, we make the dataset discrete.
96 | After that, we create two different extractors: REAL and Trepan.
97 | We output the extracted theory for both extractors.
98 | 
99 | REAL extracted rules:
100 | ```
101 | iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :- PetalWidth =< 1.0.
102 | iris(PetalLength1, PetalWidth1, SepalLength1, SepalWidth1, versicolor) :- PetalLength1 > 4.9, SepalWidth1 in [2.9, 3.2].
103 | iris(PetalLength2, PetalWidth2, SepalLength2, SepalWidth2, versicolor) :- PetalWidth2 > 1.6.
104 | iris(PetalLength3, PetalWidth3, SepalLength3, SepalWidth3, virginica) :- SepalWidth3 =< 2.9.
105 | iris(PetalLength4, PetalWidth4, SepalLength4, SepalWidth4, virginica) :- SepalLength4 in [5.4, 6.3].
106 | iris(PetalLength5, PetalWidth5, SepalLength5, SepalWidth5, virginica) :- PetalWidth5 in [1.0, 1.6].
107 | ```
108 | 
109 | Trepan extracted rules:
110 | ```
111 | iris(PetalLength6, PetalWidth6, SepalLength6, SepalWidth6, virginica) :- PetalLength6 > 3.0, PetalLength6 in [3.0, 4.9].
112 | iris(PetalLength7, PetalWidth7, SepalLength7, SepalWidth7, versicolor) :- PetalLength7 > 3.0.
113 | iris(PetalLength8, PetalWidth8, SepalLength8, SepalWidth8, setosa) :- true.
114 | ```
115 | 
116 | 
117 | ## Developers
118 | 
119 | Working with the PSyKE codebase requires a number of tools to be installed:
120 | * Python 3.11
121 | + Python versions >= `3.12.x` are currently __not__ supported
122 | 
123 | * JDK 11+ (please ensure the `JAVA_HOME` environment variable is properly configured)
124 | * Git 2.20+
125 | 
126 | ### Develop PSyKE with PyCharm
127 | 
128 | To participate in the development of PSyKE, we suggest the [PyCharm](https://www.jetbrains.com/pycharm/) IDE.
129 | 
130 | #### Importing the project
131 | 
132 | 1. Clone this repository into a folder of your preference using `git clone`
133 | 2. Open PyCharm
134 | 3. Select `Open`
135 | 4. Navigate your file system and find the folder where you cloned the repository
136 | 5. Click `Open`
137 | 
138 | ### Developing the project
139 | 
140 | Contributions to this project are welcome. Just a few rules:
141 | * We use [git flow](https://github.com/nvie/gitflow), so if you write new features, please do so in a separate `feature/` branch
142 | * We recommend forking the project, developing your code, then contributing back via pull request
143 | * Commit often
144 | * Stay in sync with the `develop` (or `master`) branch (pull frequently if the build passes)
145 | * Do not introduce low-quality or untested code
146 | 
147 | #### Issue tracking
148 | If you encounter any problem while using or developing PSyKE, please report it through the project
149 | ["Issues" section](https://github.com/psykei/psyke-python/issues) on GitHub. 
150 | -------------------------------------------------------------------------------- /psyke/utils/plot.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Callable, Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from matplotlib import colors 6 | import matplotlib.pyplot as plt 7 | from matplotlib.lines import Line2D 8 | from tuprolog.solve.prolog import prolog_solver 9 | from tuprolog.theory import Theory, mutable_theory 10 | 11 | from psyke.extraction.hypercubic import HyperCubeExtractor 12 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 13 | 14 | import matplotlib 15 | #matplotlib.use('TkAgg') 16 | 17 | 18 | def plot_init(xlim, ylim, xlabel, ylabel, size=(4, 3), equal=False): 19 | plt.figure(figsize=size) 20 | if equal: 21 | plt.gca().set_aspect(1) 22 | plt.xlim(xlim) 23 | plt.ylim(ylim) 24 | plt.gca().set_xlabel(xlabel) 25 | plt.gca().set_ylabel(ylabel) 26 | plt.gca().set_rasterized(True) 27 | 28 | 29 | def plot_point(x, y, color, marker, ec=None): 30 | plt.scatter(x, y, c=color, marker=marker, edgecolors=ec, linewidths=0.6) 31 | 32 | 33 | def plot_classification_samples(dataframe, classes, colors, markers, labels, loc, name, show=True, ec=None): 34 | marks = [Line2D([0], [0], color=c, marker=m, lw="0") for c, m in zip(colors, markers)] 35 | 36 | for cl, c, m in zip(classes, colors, markers): 37 | df = dataframe[dataframe.target == cl] 38 | plot_point(df["petal length"], df["petal width"], c, m, ec=ec) 39 | 40 | plt.gca().legend(marks, labels, loc=loc) 41 | plt.savefig("plot/{}.pdf".format(name), dpi=500, bbox_inches='tight') 42 | if show: 43 | plt.show() 44 | 45 | 46 | def plot_boundaries(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 47 | a: float = .5, h: str = '////////', ls='-', e=.05, fc='none', ec=None, reverse=False): 48 | cubes = extractor._hypercubes.copy() 49 | if reverse: 50 | cubes.reverse() 51 | for cube in cubes: 52 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 53 | fc=colors[cube.output] if fc is None else fc, 54 | ec=colors[cube.output] if ec is None else ec, alpha=a, hatch=h, linestyle=ls) 55 | 56 | 57 | def plot_surfaces(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], ec='r', e=.05): 58 | for cube in extractor._hypercubes: 59 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 60 | fc='none', ec=ec) 61 | 62 | 63 | def plot_perimeters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], n: int = 5, 64 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 65 | for cube in extractor._hypercubes: 66 | for corner in cube.perimeter_samples(n): 67 | plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 68 | 69 | 70 | def plot_centers(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 71 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 72 | for cube in extractor._hypercubes: 73 | center = cube.center 74 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 75 | 76 | 77 | def plot_corners(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 78 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 79 | for cube in extractor._hypercubes: 80 | for corner in cube.corners(): 81 | 
plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 82 | 83 | 84 | def plot_barycenters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 85 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 86 | for cube in extractor._hypercubes: 87 | center = cube.barycenter 88 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 89 | 90 | 91 | def predict_from_theory(theory: Theory, data: pd.DataFrame) -> list[float or str]: 92 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 93 | index = data.shape[1] - 1 94 | y_element = data.iloc[0, -1] 95 | cast: Callable = lambda x: (str(x) if isinstance(y_element, str) else x) 96 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in data.iterrows()] 97 | return [cast(query.solved_query.get_arg_at(index)) if query.is_yes else -1 for query in substitutions] 98 | 99 | 100 | def plot_theory(theory: Theory, data: pd.DataFrame = None, output: str = 'plot.pdf', azimuth: float = 45, 101 | distance: float = 9, elevation: float = 5, show_theory: bool = True, features: Iterable[str] = None) -> None: 102 | # Check if the number of common variables in clauses is less or equal to three. 103 | # If not raise an exception. 104 | fresh_theory = mutable_theory(theory) 105 | clauses = fresh_theory.clauses 106 | variables = sorted(list(set(arg.args[0].name.split('_')[0] for clause in clauses if clause.body_size > 0 and clause.body.is_recursive for arg in clause.body.unfolded)), reverse=True) 107 | if len(variables) > 3: 108 | raise Exception("Theory contains too many different features in the body of clauses, maximum is 3.") 109 | # If data is None, then create synthetic data covering a good portion of the variables space. 110 | # Just skip for now. 
111 | if data is None: 112 | raise Exception("Method without data is not implemented yet") 113 | 114 | # Prepare data 115 | ys = predict_from_theory(fresh_theory, data) 116 | xs = data[variables].values.tolist() 117 | for i in range(len(ys)): 118 | xs[i].append(ys[i]) 119 | 120 | # Prepare colors 121 | if isinstance(ys[0], str): 122 | np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 123 | 124 | class ColorGenerator: 125 | 126 | def __init__(self): 127 | self.color_list = ['red', 'royalblue', 'green', 'orange', 'pink', 'acqua', 'grey'] 128 | self.counter = 0 129 | 130 | def get_new_color(self) -> str: 131 | self.counter += 1 132 | if self.counter > len(self.color_list): 133 | raise Exception("Classes exceed the maximum supported number (7)") 134 | return self.color_list[self.counter - 1] 135 | 136 | classes = set(ys) 137 | generator = ColorGenerator() 138 | class_color = {c: generator.get_new_color() for c in classes} 139 | get_color: Callable = lambda c: class_color[c] 140 | else: 141 | def color_fader(v: float = 0., c1: str = 'green', c2: str = 'red'): 142 | c1 = array(colors.to_rgb(c1)) 143 | c2 = array(colors.to_rgb(c2)) 144 | return colors.to_hex((1 - v) * c1 + v * c2) 145 | min_value = min(ys) 146 | max_value = max(ys) 147 | get_normalized_value: Callable = lambda v: (v - min_value)/(max_value - min_value) 148 | get_color: Callable = lambda c: color_fader(get_normalized_value(c)) 149 | 150 | fig = plt.figure() 151 | fig.set_size_inches(10, 10) 152 | if len(variables) == 3: 153 | ax = fig.add_subplot(projection='3d') 154 | else: 155 | ax = fig.add_subplot() 156 | 157 | for x in xs: 158 | ax.scatter(*x[:-1], c=get_color(x[-1]), s=14) 159 | 160 | ax.set_xlabel(variables[0], fontsize=18) 161 | ax.set_ylabel(variables[1], fontsize=18) 162 | if len(variables) == 3: 163 | ax.set_zlabel(variables[2], fontsize=18) 164 | 165 | ax.azim = azimuth 166 | ax.dist = distance 167 | ax.elev = elevation 168 | ax.set_title('Predictions according to Prolog theory', fontsize=24) 169 | if show_theory: 170 | pass 171 | # ax.text2D(0., 0.88, pretty_theory(theory, new_line=False), transform=ax.transAxes, fontsize=8) 172 | if isinstance(ys[0], str): 173 | custom_lines = [Line2D([0], [0], marker='o', markerfacecolor=get_color(c), 174 | markersize=20, color='w') for c in classes] 175 | ax.legend(custom_lines, classes, loc='upper left', numpoints=1, ncol=3, fontsize=18, bbox_to_anchor=(0, 0)) 176 | plt.savefig(output, format='pdf') 177 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | 4 | from sklearn.metrics import accuracy_score, r2_score 5 | 6 | 7 | class Node: 8 | def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None): 9 | self.feature = feature 10 | self.threshold = threshold 11 | self.left = left 12 | self.right = right 13 | self.value = value 14 | 15 | def is_leaf_node(self): 16 | return self.value is not None 17 | 18 | 19 | class FairTree: 20 | def __init__(self, max_depth=3, max_leaves=None, criterion=None, min_samples_split=2, lambda_penalty=0.0, 21 | protected_attr=None): 22 | self.max_depth = max_depth 23 | self.max_leaves = max_leaves 24 | self.min_samples_split = min_samples_split 25 | self.lambda_penalty = lambda_penalty 26 | self.protected_attr = protected_attr 27 | self.criterion = criterion 28 | self.root = None 29 | 
self.n_leaves = 0 30 | self.quality_function = None 31 | 32 | def fit(self, X, y): 33 | self.n_leaves = 0 34 | self.root = self._grow_tree(X, y, depth=0) 35 | while self.n_leaves > self.max_leaves: 36 | self.prune_least_important_leaf(X, y) 37 | self.n_leaves -= 1 38 | return self 39 | 40 | @staticmethod 41 | def _estimate_output(y): 42 | raise NotImplementedError 43 | 44 | def score(self, X, y): 45 | raise NotImplementedError 46 | 47 | def predict(self, X): 48 | return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()]) 49 | 50 | def _traverse_tree(self, x, node): 51 | if node.is_leaf_node(): 52 | return node.value 53 | if x[node.feature] <= node.threshold: 54 | return self._traverse_tree(x, node.left) 55 | return self._traverse_tree(x, node.right) 56 | 57 | def _grow_tree(self, X, y, depth): 58 | if depth >= self.max_depth or X.shape[0] < self.min_samples_split or len(set(y.values.flatten())) == 1 or \ 59 | (self.max_leaves is not None and self.n_leaves >= self.max_leaves): 60 | self.n_leaves += 1 61 | return Node(value=self._estimate_output(y)) 62 | 63 | best_feature, best_threshold = self._best_split(X, y) 64 | if best_feature is None: 65 | self.n_leaves += 1 66 | return Node(value=self._estimate_output(y)) 67 | 68 | left_idxs = X[best_feature] <= best_threshold 69 | right_idxs = X[best_feature] > best_threshold 70 | 71 | left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1) 72 | right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1) 73 | return Node(best_feature, best_threshold, left, right) 74 | 75 | @staticmethod 76 | def generate_thresholds(X, y): 77 | sorted_indices = np.argsort(X) 78 | X = np.array(X)[sorted_indices] 79 | y = np.array(y)[sorted_indices] 80 | # X = np.array(np.unique(np.unique(list(zip(X, y)), axis=0)[:, 0]), dtype=float) 81 | return np.array([(X[:-1][i] + X[1:][i]) / 2.0 for i in range(len(X) - 1) if y[i] != y[i + 1]]) 82 | 83 | def _best_split(self, X, y): 84 | best_gain = -float('inf') 85 | split_idx, split_threshold = None, None 86 | 87 | for feature in [feature for feature in X.columns if feature not in self.protected_attr]: 88 | # for threshold in self.generate_thresholds(X[feature], y): 89 | for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))): 90 | left_idxs = X[feature] <= threshold 91 | right_idxs = X[feature] > threshold 92 | 93 | if left_idxs.sum() == 0 or right_idxs.sum() == 0: 94 | continue 95 | 96 | gain = self._fair_gain(y, left_idxs, right_idxs, X[self.protected_attr]) 97 | 98 | if gain > best_gain: 99 | best_gain = gain 100 | split_idx = feature 101 | split_threshold = threshold 102 | return split_idx, split_threshold 103 | 104 | @staticmethod 105 | def _disparity(group): 106 | counts = Counter(group) 107 | if len(counts) <= 1: 108 | return 0.0 109 | values = np.array(list(counts.values())) / len(group) 110 | return np.abs(values[0] - values[1]) 111 | 112 | def _fair_gain(self, y, left_idx, right_idx, protected): 113 | child = len(y[left_idx]) / len(y) * self.quality_function(y[left_idx]) + \ 114 | len(y[right_idx]) / len(y) * self.quality_function(y[right_idx]) 115 | info_gain = self.quality_function(y) - child 116 | penalty = self._disparity(protected[left_idx]) + self._disparity(protected[right_idx]) 117 | return info_gain - self.lambda_penalty * penalty 118 | 119 | @staticmethod 120 | def _match_path(x, path): 121 | for node, left in path: 122 | if left and x[node.feature] > node.threshold: 123 | return False 124 | if not left and x[node.feature] <= node.threshold: 125 | return 
False 126 | return True 127 | 128 | @staticmethod 129 | def candidates(node, parent=None, is_left=None, path=[]): 130 | if node is None or node.is_leaf_node(): 131 | return [] 132 | leaves = [] 133 | if node.left.is_leaf_node() and node.right.is_leaf_node(): 134 | leaves.append((node, parent, is_left, path)) 135 | leaves += FairTreeClassifier.candidates(node.left, node, True, path + [(node, True)]) 136 | leaves += FairTreeClassifier.candidates(node.right, node, False, path + [(node, False)]) 137 | return leaves 138 | 139 | def prune_least_important_leaf(self, X, y): 140 | best_score = -np.inf 141 | best_prune = None 142 | 143 | for node, parent, is_left, path in self.candidates(self.root): 144 | original_left = node.left 145 | original_right = node.right 146 | 147 | merged_y = y[(X.apply(lambda x: self._match_path(x, path), axis=1))] 148 | if len(merged_y) == 0: 149 | continue 150 | new_value = self._estimate_output(merged_y) 151 | node.left = node.right = None 152 | node.value = new_value 153 | 154 | score = self.score(X, y) 155 | if score >= best_score: 156 | best_score = score 157 | best_prune = (node, new_value) 158 | 159 | node.left, node.right, node.value = original_left, original_right, None 160 | 161 | if best_prune: 162 | best_prune[0].left = best_prune[0].right = None 163 | best_prune[0].value = best_prune[1] 164 | 165 | 166 | class FairTreeClassifier(FairTree): 167 | def __init__(self, max_depth=3, max_leaves=None, criterion='entropy', min_samples_split=2, lambda_penalty=0.0, 168 | protected_attr=None): 169 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 170 | self.quality_function = self._gini if self.criterion == 'gini' else self._entropy 171 | 172 | @staticmethod 173 | def _estimate_output(y): 174 | return Counter(y.values.flatten()).most_common(1)[0][0] 175 | 176 | def score(self, X, y): 177 | return accuracy_score(y.values.flatten(), self.predict(X)) 178 | 179 | @staticmethod 180 | def _entropy(y): 181 | ps = np.unique(y, return_counts=True)[1] / len(y) 182 | return -np.sum([p * np.log2(p) for p in ps if p > 0]) 183 | 184 | @staticmethod 185 | def _gini(y): 186 | return 1.0 - np.sum(np.unique(y, return_counts=True)[1] / len(y)**2) 187 | 188 | 189 | class FairTreeRegressor(FairTree): 190 | def __init__(self, max_depth=3, max_leaves=None, criterion='mse', min_samples_split=2, lambda_penalty=0.0, 191 | protected_attr=None): 192 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 193 | self.quality_function = self._mse 194 | 195 | @staticmethod 196 | def _estimate_output(y): 197 | return np.mean(y.values.flatten()) 198 | 199 | def score(self, X, y): 200 | return r2_score(y.values.flatten(), self.predict(X)) 201 | 202 | @staticmethod 203 | def _mse(y): 204 | y = y.values.flatten().astype(float) 205 | return np.mean((y - np.mean(y))**2) 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import pathlib 3 | import subprocess 4 | import distutils.cmd 5 | 6 | here = pathlib.Path(__file__).parent.resolve() 7 | 8 | version_file = here / 'VERSION' 9 | 10 | # Get the long description from the README file 11 | long_description = (here / 'README.md').read_text(encoding='utf-8') 12 | 13 | 14 | EPOCHS: int = 50 15 | BATCH_SIZE: int = 16 16 | REQUIREMENTS = [ 17 | 'numpy~=2.3.4', 18 | 'pandas~=2.3.0', 19 | 
'scikit-learn~=1.8.0', 20 | '2ppy~=0.4.0', 21 | 'kneed~=0.8.1', 22 | 'sympy~=1.11' 23 | ] # Optional 24 | 25 | 26 | def format_git_describe_version(version): 27 | if '-' in version: 28 | splitted = version.split('-') 29 | tag = splitted[0] 30 | index = f"dev{splitted[1]}" 31 | return f"{tag}.{index}" 32 | else: 33 | return version 34 | 35 | 36 | def get_version_from_git(): 37 | try: 38 | process = subprocess.run(["git", "describe"], cwd=str(here), check=True, capture_output=True) 39 | version = process.stdout.decode('utf-8').strip() 40 | version = format_git_describe_version(version) 41 | with version_file.open('w') as f: 42 | f.write(version) 43 | return version 44 | except subprocess.CalledProcessError: 45 | if version_file.exists(): 46 | return version_file.read_text().strip() 47 | else: 48 | return '0.1.0.archeo' 49 | 50 | 51 | version = get_version_from_git() 52 | 53 | 54 | print(f"Detected version {version} from git describe") 55 | 56 | 57 | class GetVersionCommand(distutils.cmd.Command): 58 | """A custom command to get the current project version inferred from git describe.""" 59 | 60 | description = 'gets the project version from git describe' 61 | user_options = [] 62 | 63 | def initialize_options(self): 64 | pass 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | print(version) 71 | 72 | 73 | #class CreateTestPredictors(distutils.cmd.Command): 74 | # description = 'gets the project version from git describe' 75 | # user_options = [] 76 | 77 | # def initialize_options(self): 78 | # pass 79 | 80 | # def finalize_options(self): 81 | # pass 82 | 83 | # def run(self): 84 | # from psyke.utils import get_default_random_seed 85 | # from psyke.utils.dataframe import get_discrete_dataset 86 | # from sklearn.model_selection import train_test_split 87 | # from test import REQUIRED_PREDICTORS, get_dataset, get_model, get_schema 88 | # from test.resources.predictors import get_predictor_path, PATH, create_predictor_name 89 | # import ast 90 | # import pandas as pd 91 | # from tensorflow.keras import Model 92 | # from test import Predictor 93 | 94 | # Read the required predictors to run the tests: 95 | # model | model_options | dataset 96 | # required_predictors = pd.read_csv(REQUIRED_PREDICTORS, sep=';') 97 | 98 | # Create missing predictors. 
99 | # model | model_options | dataset 100 | # for index, row in required_predictors.iterrows(): 101 | # options = ast.literal_eval(row['model_options']) 102 | # file_name = create_predictor_name(row['dataset'], row['model'], options) 103 | # if not get_predictor_path(file_name).is_file(): 104 | # dataset = get_dataset(row['dataset']) 105 | # if row['bins'] > 0: 106 | # schema = get_schema(dataset) # int(row['bins']) 107 | # dataset = get_discrete_dataset(dataset.iloc[:, :-1], schema).join(dataset.iloc[:, -1]) 108 | # model, _ = get_model(row['model'], options) 109 | # training_set, test_set = train_test_split(dataset, test_size=0.5, 110 | # random_state=get_default_random_seed()) 111 | # if isinstance(model, Model): 112 | # keys = set(training_set.iloc[:, -1]) 113 | # mapping = {key: i for i, key in enumerate(keys)} 114 | # training_set.iloc[:, -1] = training_set.iloc[:, -1].apply(lambda x: mapping[x]) 115 | # test_set.iloc[:, -1] = test_set.iloc[:, -1].apply(lambda x: mapping[x]) 116 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1], epochs=EPOCHS, batch_size=BATCH_SIZE) 117 | # else: 118 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 119 | # predictor = Predictor(model) 120 | # predictor.save_to_onnx(PATH / file_name, Predictor.get_initial_types(training_set.iloc[:, :-1])) 121 | 122 | # required_predictors.to_csv(REQUIRED_PREDICTORS, sep=';', index=False) 123 | 124 | # print("Done") 125 | 126 | 127 | class CreateTheoryPlot(distutils.cmd.Command): 128 | description = 'create a plot representing samples X and their class/regression value Y predicted by a theory' 129 | user_options = [('theory=', 't', 'textual file of a Prolog theory'), 130 | ('dataset=', 'd', 'file of a dataset'), 131 | ('azimuth=', 'a', 'azimuth of the plot'), 132 | ('distance=', 'D', 'distance from the plot'), 133 | ('elevation=', 'e', 'elevation of the plot'), 134 | ('output=', 'o', 'output file name of the plot'), 135 | ('show=', 's', 'show theory in the plot ([y]/n)'), 136 | ] 137 | default_output_file_name = 'dummy/plot' 138 | default_theory_name = 'dummy/iris-theory' 139 | default_dataset_name = 'dummy/iris' 140 | default_azimuth = '45' 141 | default_distance = '9' 142 | default_elevation = '5' 143 | csv_format = '.csv' 144 | txt_format = '.txt' 145 | pdf_format = '.pdf' 146 | 147 | def initialize_options(self): 148 | self.output = self.default_output_file_name 149 | self.theory = self.default_theory_name 150 | self.dataset = self.default_dataset_name 151 | self.azimuth = self.default_azimuth 152 | self.elevation = self.default_elevation 153 | self.distance = self.default_distance 154 | self.show = True 155 | 156 | def finalize_options(self): 157 | self.theory_file = str(self.theory) 158 | self.data = str(self.dataset) 159 | self.output = str(self.output) 160 | self.a = float(self.azimuth) 161 | self.e = float(self.elevation) 162 | self.d = float(self.distance) 163 | self.s = self.show in (True, 'y', 'Y', 'yes', 'YES', 'Yes') 164 | 165 | def run(self): 166 | import pandas as pd 167 | from tuprolog.theory.parsing import parse_theory 168 | from psyke.utils.plot import plot_theory 169 | 170 | if self.theory_file is None or self.theory_file == '': 171 | raise Exception('Empty theory file name') 172 | if self.data is None or self.data == '': 173 | raise Exception('Empty dataset file name') 174 | with open(self.theory_file + (self.txt_format if '.' 
not in self.theory_file else ''), 'r') as file: 175 | textual_theory = file.read() 176 | theory = parse_theory(textual_theory) 177 | data = pd.read_csv(self.data + (self.csv_format if '.' not in self.data else '')) 178 | plot_theory(theory, data, self.output + self.pdf_format, self.a, self.d, self.e, self.s) 179 | 180 | 181 | setup( 182 | name='psyke', # Required 183 | version=version, 184 | description='Python-based implementation of PSyKE, i.e. a Platform for Symbolic Knowledge Extraction', 185 | license='Apache 2.0 License', 186 | long_description=long_description, 187 | long_description_content_type='text/markdown', 188 | url='https://github.com/psykei/psyke-python', 189 | author='Matteo Magnini', 190 | author_email='matteo.magnini@unibo.it', 191 | classifiers=[ 192 | 'Development Status :: 3 - Alpha', 193 | 'Intended Audience :: Developers', 194 | 'Topic :: Software Development :: Libraries', 195 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 196 | 'License :: OSI Approved :: Apache Software License', 197 | 'Programming Language :: Python :: 3', 198 | 'Programming Language :: Python :: 3.11', 199 | 'Programming Language :: Python :: 3 :: Only', 200 | 'Programming Language :: Prolog' 201 | ], 202 | keywords='knowledge extraction, symbolic ai, ske, extractor, rules, prolog', # Optional 203 | # package_dir={'': 'src'}, # Optional 204 | packages=find_packages('.'), # Required 205 | include_package_data=True, 206 | python_requires='==3.11', 207 | install_requires=REQUIREMENTS, # Optional 208 | zip_safe=False, 209 | platforms="Independant", 210 | project_urls={ # Optional 211 | 'Bug Reports': 'https://github.com/psykei/psyke-python/issues', 212 | # 'Funding': 'https://donate.pypi.org', 213 | # 'Say Thanks!': 'http://saythanks.io/to/example', 214 | 'Source': 'https://github.com/psykei/psyke-python', 215 | }, 216 | cmdclass={ 217 | 'get_project_version': GetVersionCommand, 218 | # 'create_test_predictors': CreateTestPredictors, 219 | 'create_theory_plot': CreateTheoryPlot 220 | }, 221 | ) 222 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import ClassifierMixin 6 | from tuprolog.theory import Theory 7 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor 8 | from psyke.extraction.hypercubic.hypercube import GenericCube 9 | from psyke.extraction.hypercubic.utils import MinUpdate, Expansion 10 | from psyke.utils import get_default_random_seed, Target 11 | 12 | 13 | class ITER(HyperCubeExtractor): 14 | """ 15 | Explanator implementing ITER algorithm, doi:10.1007/11823728_26. 
16 | """ 17 | 18 | def __init__(self, predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps, 19 | ignore_dimensions: Iterable, normalization, output: Target = Target.CONSTANT, 20 | seed=get_default_random_seed()): 21 | super().__init__(predictor, output, normalization=normalization) 22 | if output is Target.REGRESSION: 23 | raise NotImplementedError 24 | self.predictor = predictor 25 | self.min_update = min_update 26 | self._init_points = n_points 27 | self.n_points = n_points 28 | self.max_iterations = max_iterations 29 | self.min_examples = min_examples 30 | self.threshold = threshold 31 | self.fill_gaps = fill_gaps 32 | self._output = Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else \ 33 | output if output is not None else Target.CONSTANT 34 | self.seed = seed 35 | self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else [] 36 | 37 | def make_fair(self, features: Iterable[str]): 38 | self.n_points = self._init_points 39 | self.ignore_dimensions += list(features) 40 | 41 | def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None: 42 | expansions = [] 43 | for limit in cubes: 44 | count = limit.cube.count(dataframe) 45 | dataframe = pd.concat([dataframe, limit.cube.create_samples(self.min_examples - count)]) 46 | limit.cube.update(dataframe, self.predictor) 47 | expansions.append(Expansion( 48 | limit.cube, limit.feature, limit.direction, 49 | abs(cube.output - limit.cube.output) if self._output is Target.CONSTANT else 50 | 1 - int(cube.output == limit.cube.output) 51 | )) 52 | if len(expansions) > 0: 53 | return sorted(expansions, key=lambda e: e.distance)[0] 54 | return None 55 | 56 | def _calculate_min_updates(self) -> Iterable[MinUpdate]: 57 | return [MinUpdate(name, (interval[1] - interval[0]) * self.min_update) for (name, interval) in 58 | self._surrounding.dimensions.items()] 59 | 60 | def _create_range(self, cube: GenericCube, min_updates: Iterable[MinUpdate], feature: str, direction: str)\ 61 | -> tuple[GenericCube, tuple[float, float]]: 62 | a, b = cube[feature] 63 | size = [min_update for min_update in min_updates if min_update.name == feature][0].value 64 | return (cube.copy(), (max(a - size, self._surrounding.get_first(feature)), a) 65 | if direction == '-' else (b, min(b + size, self._surrounding.get_second(feature)))) 66 | 67 | def _create_temp_cube(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 68 | hypercubes: Iterable[GenericCube], feature: str, 69 | direction: str) -> Iterable[Expansion]: 70 | temp_cube, values = self._create_range(cube, min_updates, feature, direction) 71 | temp_cube.update_dimension(feature, values) 72 | overlap = temp_cube.overlap(hypercubes) 73 | while (overlap is not None) & (temp_cube.has_volume()): 74 | overlap = ITER._resolve_overlap(temp_cube, overlap, hypercubes, feature, direction) 75 | if (temp_cube.has_volume() & (overlap is None)) & (all(temp_cube != cube for cube in hypercubes)): 76 | yield Expansion(temp_cube, feature, direction) 77 | else: 78 | cube.add_limit(feature, direction) 79 | 80 | def _create_temp_cubes(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 81 | hypercubes: Iterable[GenericCube]) -> Iterable[Expansion]: 82 | tmp_cubes = [] 83 | for feature in self._surrounding.dimensions.keys(): 84 | if feature in self.ignore_dimensions: 85 | continue 86 | limit = cube.check_limits(feature) 87 | if limit == '*': 88 | continue 89 | for x in {'-', '+'} - {limit}: 90 | tmp_cubes += 
self._create_temp_cube(cube, min_updates, hypercubes, feature, x) 91 | return tmp_cubes 92 | 93 | def _cubes_to_update(self, dataframe: pd.DataFrame, to_expand: Iterable[GenericCube], 94 | hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate]) \ 95 | -> Iterable[tuple[GenericCube, Expansion]]: 96 | results = [(hypercube, self._best_cube(dataframe, hypercube, self._create_temp_cubes( 97 | hypercube, min_updates, hypercubes))) for hypercube in to_expand] 98 | return sorted([result for result in results if result[1] is not None], key=lambda x: x[1].distance) 99 | 100 | def _expand_or_create(self, cube: GenericCube, expansion: Expansion, hypercubes: Iterable[GenericCube]) -> None: 101 | if expansion.distance > self.threshold: 102 | hypercubes += [expansion.cube] 103 | else: 104 | cube.expand(expansion, hypercubes) 105 | 106 | @staticmethod 107 | def _find_closer_sample(dataframe: pd.DataFrame, output: float | str) -> dict[str, float]: 108 | if isinstance(output, str): 109 | close_sample = dataframe[dataframe.iloc[:, -1] == output].iloc[0].to_dict() 110 | else: 111 | difference = abs(dataframe.iloc[:, -1] - output) 112 | close_sample = dataframe[difference == min(difference)].iloc[0].to_dict() 113 | return close_sample 114 | 115 | def _generate_starting_points(self, dataframe: pd.DataFrame) -> Iterable[GenericCube]: 116 | if self.n_points <= 0: 117 | raise (Exception('InvalidAttributeValueException')) 118 | points: Iterable[float] 119 | if isinstance(dataframe.iloc[0, -1], str): 120 | classes = np.unique(dataframe.iloc[:, -1].values) 121 | points = [classes[i] for i in range(min(self.n_points, len(classes)))] 122 | else: 123 | desc = dataframe.iloc[:, -1].describe() 124 | min_output, max_output = desc["min"], desc["max"] 125 | points = [(max_output - min_output) / 2] if self.n_points == 1 else \ 126 | [min_output + (max_output - min_output) / (self.n_points - 1) * index for index in range(self.n_points)] 127 | return [HyperCube.cube_from_point(ITER._find_closer_sample(dataframe, point), output=self._output) 128 | for point in points] 129 | 130 | def _initialize(self, dataframe: pd.DataFrame) -> Iterable[MinUpdate]: 131 | self._fake_dataframe = dataframe.copy() 132 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 133 | min_updates = self._calculate_min_updates() 134 | self._init_hypercubes(dataframe, min_updates) 135 | for hypercube in self._hypercubes: 136 | hypercube.update(dataframe, self.predictor) 137 | return min_updates 138 | 139 | def _init_hypercubes(self, dataframe: pd.DataFrame, min_updates: Iterable[MinUpdate]): 140 | while True: 141 | hypercubes = self._generate_starting_points(dataframe) 142 | for hypercube in hypercubes: 143 | hypercube.expand_all(min_updates, self._surrounding) 144 | for d in self.ignore_dimensions: 145 | hypercube[d] = self._surrounding[d] 146 | self.n_points = self.n_points - 1 147 | if not HyperCube.check_overlap(hypercubes, hypercubes): 148 | break 149 | self._hypercubes = hypercubes 150 | 151 | def _iterate(self, dataframe: pd.DataFrame, hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate], 152 | left_iteration: int) -> int: 153 | np.random.seed(self.seed) 154 | iterations = 0 155 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 156 | while (len(to_expand) > 0) and (iterations < left_iteration): 157 | updates = list(self._cubes_to_update(dataframe, to_expand, hypercubes, min_updates)) 158 | if len(updates) > 0: 159 | 
self._expand_or_create(updates[0][0], updates[0][1], hypercubes) 160 | iterations += 1 161 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 162 | return iterations 163 | 164 | @staticmethod 165 | def _resolve_overlap(cube: GenericCube, overlapping_cube: GenericCube, hypercubes: Iterable[GenericCube], 166 | feature: str, direction: str) -> GenericCube: 167 | a, b = cube[feature] 168 | cube.update_dimension(feature, max(overlapping_cube.get_second(feature), a) if direction == '-' else a, 169 | min(overlapping_cube.get_first(feature), b) if direction == '+' else b) 170 | return cube.overlap(hypercubes) 171 | 172 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 173 | min_updates = self._initialize(dataframe) 174 | temp_train = dataframe.copy() 175 | fake = dataframe.copy() 176 | iterations = 0 177 | while temp_train.shape[0] > 0: 178 | iterations += self._iterate(fake, self._hypercubes, min_updates, self.max_iterations - iterations) 179 | if (iterations >= self.max_iterations) or (not self.fill_gaps): 180 | break 181 | temp_train = temp_train.iloc[[p is None for p in self.predict(temp_train.iloc[:, :-1])]] 182 | if temp_train.shape[0] > 0: 183 | point, ratio, overlap, new_cube = temp_train.iloc[0].to_dict(), 1.0, True, None 184 | temp_train = temp_train.drop([temp_train.index[0]]) 185 | while overlap is not None: 186 | if new_cube is not None: 187 | if not new_cube.has_volume(): 188 | break 189 | new_cube = HyperCube.cube_from_point(point, self._output) 190 | new_cube.expand_all(min_updates, self._surrounding, ratio) 191 | overlap = new_cube.overlap(self._hypercubes) 192 | ratio *= 2 193 | if new_cube.has_volume(): 194 | self._hypercubes += [new_cube] 195 | return self._create_theory(dataframe) 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021, Matteo Magnini and contributors listed in 190 | the CONTRIBUTORS file. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------