├── .python-version ├── psyke ├── genetic │ ├── __init__.py │ ├── fgin │ │ └── __init__.py │ └── gin │ │ └── __init__.py ├── extraction │ ├── hypercubic │ │ ├── gridrex │ │ │ └── __init__.py │ │ ├── utils.py │ │ ├── cosmik │ │ │ └── __init__.py │ │ ├── strategy.py │ │ ├── creepy │ │ │ └── __init__.py │ │ ├── gridex │ │ │ └── __init__.py │ │ ├── divine │ │ │ └── __init__.py │ │ ├── ginger │ │ │ └── __init__.py │ │ ├── hex │ │ │ └── __init__.py │ │ └── iter │ │ │ └── __init__.py │ ├── __init__.py │ ├── real │ │ ├── utils.py │ │ └── __init__.py │ ├── cart │ │ ├── FairTreePredictor.py │ │ ├── __init__.py │ │ ├── CartPredictor.py │ │ └── FairTree.py │ └── trepan │ │ ├── utils.py │ │ └── __init__.py ├── utils │ ├── sorted.py │ ├── __init__.py │ ├── metrics.py │ ├── dataframe.py │ └── plot.py ├── clustering │ ├── __init__.py │ ├── utils.py │ ├── cream │ │ └── __init__.py │ └── exact │ │ └── __init__.py ├── tuning │ ├── crash │ │ └── __init__.py │ ├── orchid │ │ └── __init__.py │ ├── __init__.py │ └── pedro │ │ └── __init__.py └── hypercubepredictor.py ├── test ├── unit │ ├── utils │ │ ├── __init__.py │ │ ├── test_simplify.py │ │ ├── test_simplify_formatter.py │ │ └── test_prune.py │ ├── clustering │ │ └── __init__.py │ ├── extraction │ │ ├── __init__.py │ │ ├── cart │ │ │ ├── __init__.py │ │ │ ├── test_cart.py │ │ │ └── test_simplified_cart.py │ │ ├── real │ │ │ ├── __init__.py │ │ │ ├── test_real.py │ │ │ └── test_rule.py │ │ ├── hypercubic │ │ │ ├── __init__.py │ │ │ ├── iter │ │ │ │ ├── __init__.py │ │ │ │ └── test_iter.py │ │ │ └── gridex │ │ │ │ ├── __init__.py │ │ │ │ └── test_gridex.py │ │ └── trepan │ │ │ ├── __init__.py │ │ │ ├── test_trepan.py │ │ │ ├── test_split.py │ │ │ └── test_node.py │ └── __init__.py └── __init__.py ├── .img ├── logo.png └── logo-wide.png ├── MANIFEST.in ├── pyproject.toml ├── .gitmodules ├── CONTRIBUTORS ├── .github ├── workflows │ ├── todos.yml │ ├── dockerify.yml │ ├── deploy.yml │ └── check.yml └── scripts │ └── retry.sh ├── requirements.txt ├── Dockerfile ├── renovate.json ├── .gitignore ├── CITATION.md ├── README.md ├── setup.py └── LICENSE /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.14 2 | -------------------------------------------------------------------------------- /psyke/genetic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
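The tree above shows that every extraction algorithm (GridEx, GridREx, ITER, CART, REAL, Trepan, ...) lives in its own sub-package, but the test files later in this dump obtain extractors through the top-level Extractor factory rather than by importing those sub-packages directly. A minimal sketch, assuming a scikit-learn-style predictor and the Extractor.gridrex factory that appears in the tests below (parameter values beyond those shown there are assumptions):

    from sklearn.tree import DecisionTreeRegressor
    from psyke import Extractor
    from psyke.extraction.hypercubic import Grid

    # Any scikit-learn-style model can be wrapped; it should be fitted on a pandas DataFrame first.
    predictor = DecisionTreeRegressor()
    # predictor.fit(train.iloc[:, :-1], train.iloc[:, -1])

    # Wrap the fitted model in an extractor and obtain a tuProlog theory from the training data.
    extractor = Extractor.gridrex(predictor, Grid())
    # theory = extractor.extract(train)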
/test/unit/extraction/hypercubic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/HEAD/.img/logo.png -------------------------------------------------------------------------------- /.img/logo-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/HEAD/.img/logo-wide.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include VERSION 2 | exclude test/* 3 | exclude demo/* 4 | exclude main.py 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test/resources"] 2 | path = test/resources 3 | url = https://github.com/psykei/psyke-pytest.git 4 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Federico Sabbatini (f.sabbatini1@campus.uniurb.it, federico.sabbatini992@gmail.com) 2 | Giovanni Ciatto (giovanni.ciatto@unibo.it, giovanni.ciatto@gmail.com) 3 | Matteo Magnini (matteo.magnini@unibo.it, matteo.magnini00@gmail.com) 4 | -------------------------------------------------------------------------------- /.github/workflows/todos.yml: -------------------------------------------------------------------------------- 1 | name: "TODOs finder" 2 | on: 3 | push: 4 | branches-ignore: 5 | - 'autodelivery**' 6 | - 'bump-**' 7 | - 'renovate/**' 8 | - 'dependabot/**' 9 | jobs: 10 | build: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - uses: "actions/checkout@master" 14 | - name: "TODO to Issue" 15 | uses: "alstr/todo-to-issue-action@v5.1.13" 16 | id: "todo" 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.3.5 2 | pandas==2.3.3 3 | scikit-learn~=1.8.0 4 | matplotlib==3.10.8 5 | sympy==1.14.0 6 | parameterized==0.9.0 7 | kneed==0.8.5 8 | deap==1.4.3 9 | scikit-fuzzy==0.5.0 10 | 2ppy==0.4.1 11 | build==1.3.0 12 | twine==6.2.0 13 | setuptools==80.9.0 14 | onnx==1.20.0 15 | 
onnxruntime==1.23.2 16 | onnxconverter-common==1.16.0 17 | skl2onnx==1.19.1 18 | joblib>=1.5.1 19 | keras>=3.10.0 20 | tensorflow==2.20.0 21 | protobuf>=3.20.3 22 | ml-dtypes>=0.3.1 23 | -------------------------------------------------------------------------------- /.github/scripts/retry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DT=${2:-${RETRY_TIME:-5m}} 4 | MAX=${3:-${MAX_RETRIES:-3}} 5 | 6 | for N in `seq 1 $MAX`; do 7 | echo "Attempt $N/$MAX: $1" 8 | eval $1; 9 | RESULT=$? 10 | if [[ $RESULT -eq 0 ]]; then 11 | exit 0 12 | fi 13 | if [[ $N -lt $MAX ]]; then 14 | echo "Failed attempt $N/$MAX. Waiting $DT" 15 | sleep $DT 16 | else 17 | echo "Failed attempt $N/$MAX." 18 | exit $RESULT 19 | fi 20 | done 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | ARG PSYKE_VERSION 3 | EXPOSE 8888 4 | RUN apt update; apt install -y -q openjdk-17-jdk 5 | RUN pip install jupyter 6 | RUN pip install psyke==$PSYKE_VERSION 7 | RUN mkdir -p /root/.jupyter 8 | ENV JUPYTER_CONF_FILE /root/.jupyter/jupyter_notebook_config.py 9 | RUN echo "c.NotebookApp.allow_origin = '*'" > $JUPYTER_CONF_FILE 10 | RUN echo "c.NotebookApp.ip = '0.0.0.0'" >> $JUPYTER_CONF_FILE 11 | RUN mkdir -p /notebook 12 | COPY test/resources/datasets/*.csv /notebook/datasets/ 13 | WORKDIR /notebook 14 | CMD jupyter notebook --allow-root --no-browser 15 | -------------------------------------------------------------------------------- /test/unit/extraction/real/test_real.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from test.unit import initialize 5 | 6 | 7 | @parameterized_class(initialize('real')) 8 | class TestReal(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 14 | 15 | def test_predict(self): 16 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridrex/__init__.py: -------------------------------------------------------------------------------- 1 | from psyke import get_default_random_seed, Target 2 | from psyke.extraction.hypercubic import Grid, RegressionCube 3 | from psyke.extraction.hypercubic.gridex import GridEx 4 | 5 | 6 | class GridREx(GridEx): 7 | """ 8 | Explanator implementing GridREx algorithm, doi:10.24963/kr.2022/57. 
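    A minimal usage sketch (assumption: `predictor` is any fitted scikit-learn-style
    regressor; the remaining arguments follow the __init__ below and the concrete
    values here are illustrative only):

        extractor = GridREx(predictor, Grid(), min_examples=100, threshold=0.1, normalization=None)
        theory = extractor.extract(dataframe)

    The tests in this repository build the same extractor through Extractor.gridrex(predictor, Grid()).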
9 | """ 10 | 11 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, normalization, 12 | seed=get_default_random_seed()): 13 | super().__init__(predictor, grid, min_examples, threshold, Target.REGRESSION, None, normalization, seed) 14 | 15 | def _default_cube(self, dimensions=None) -> RegressionCube: 16 | return RegressionCube() 17 | -------------------------------------------------------------------------------- /psyke/utils/sorted.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Any 2 | 3 | 4 | class SortedList(list): 5 | 6 | def __init__(self, comparator: Callable[[Any, Any], int]): 7 | super().__init__() 8 | self.comparator = comparator 9 | 10 | def add(self, item) -> None: 11 | if len(self) == 0: 12 | self.insert(0, item) 13 | else: 14 | starting_len = len(self) 15 | for index, element in enumerate(self): 16 | if self.comparator(element, item) > 0: 17 | self.insert(index, item) 18 | break 19 | if len(self) == starting_len: 20 | self.append(item) 21 | 22 | def add_all(self, other) -> None: 23 | for item in other: 24 | self.add(item) 25 | -------------------------------------------------------------------------------- /psyke/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable 3 | 4 | from psyke import Clustering, Target 5 | from psyke.extraction.hypercubic import HyperCube 6 | from psyke.hypercubepredictor import HyperCubePredictor 7 | 8 | 9 | class HyperCubeClustering(HyperCubePredictor, Clustering, ABC): 10 | 11 | def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None): 12 | HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization) 13 | self._protected_features = [] 14 | 15 | def get_hypercubes(self) -> Iterable[HyperCube]: 16 | raise NotImplementedError('get_hypercubes') 17 | 18 | def make_fair(self, features: Iterable[str]): 19 | self._protected_features = features 20 | -------------------------------------------------------------------------------- /test/unit/utils/test_simplify.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import simplify 5 | 6 | 7 | class TestSimplify(unittest.TestCase): 8 | 9 | def test_simplify(self): 10 | # TODO: if numbers are not float equals method return false (e.g., 2 instead of 2.0). @Giovanni 2ppy 11 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1.0), '>'(Y, 2.0), '=<'(X, 0.5))." 12 | textual_simplified_theory = "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 2.0))." 
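        # The fixture encodes what simplify() is expected to do here: '=<'(X, 1.0) and '=<'(X, 0.5)
        # bound the same variable in the same direction, so only the tighter '=<'(X, 0.5) is kept,
        # while the independent '>'(Y, 2.0) constraint is left untouched.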
13 | long_theory = mutable_theory(parse_theory(textual_theory)) 14 | simplified_theory = theory(parse_theory(textual_simplified_theory)) 15 | 16 | self.assertTrue(simplified_theory.equals(simplify(long_theory), False)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /test/unit/utils/test_simplify_formatter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.tree import DecisionTreeRegressor 4 | from psyke import Extractor, get_default_random_seed 5 | from psyke.extraction.hypercubic import Grid 6 | from test import get_dataset 7 | 8 | 9 | class TestSimplifyFormatter(unittest.TestCase): 10 | 11 | def test_simplify_formatter(self): 12 | data = get_dataset('house') 13 | train, test = train_test_split(data, test_size=0.5, random_state=get_default_random_seed()) 14 | predictor = DecisionTreeRegressor() 15 | predictor.fit(train.iloc[:, :-1], train.iloc[:, -1]) 16 | extractor = Extractor.gridrex(predictor, Grid()) 17 | theory = extractor.extract(train) 18 | # print(pretty_theory(theory)) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | ":rebaseStalePrs", 5 | ":semanticCommits", 6 | "docker:disable" 7 | ], 8 | "assignees": [ 9 | "MatteoMagnini" 10 | ], 11 | "automerge": true, 12 | "dependencyDashboard": true, 13 | "git-submodules": { 14 | "enabled": true 15 | }, 16 | "includeForks": true, 17 | "packageRules": [ 18 | { 19 | "description": "Updates to GitHub Actions should be tagged as 'ci'", 20 | "matchPaths": [ 21 | ".github/workflows/*.yml", 22 | ".github/workflows/*.yaml" 23 | ], 24 | "semanticCommitType": "ci" 25 | }, 26 | { 27 | "matchPackageNames": ["net.sourceforge.plantuml:plantuml"], 28 | "allowedVersions": "/^1\\./" 29 | } 30 | ], 31 | "prConcurrentLimit": 25, 32 | "prHourlyLimit": 0, 33 | "separateMajorMinor": true, 34 | "separateMinorPatch": true, 35 | "separateMultipleMajor": true 36 | } 37 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/iter/test_iter.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from psyke.utils import get_default_precision 4 | from test.unit import initialize 5 | import unittest 6 | 7 | 8 | @parameterized_class(initialize('iter')) 9 | class TestIter(unittest.TestCase): 10 | 11 | def test_extract(self): 12 | logger.info(self.expected_theory) 13 | logger.info(self.extracted_theory) 14 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 21 | get_default_precision()) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /psyke/extraction/__init__.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import pandas as pd 4 | from tuprolog.theory import Theory 5 | 6 | from psyke import Extractor 7 | 8 | 9 | class PedagogicalExtractor(Extractor, ABC): 10 | 11 | def __init__(self, predictor, discretization=None, normalization=None): 12 | Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization) 13 | 14 | def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame: 15 | new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index) 16 | data = dataframe.iloc[:, :-1].copy().join(new_y) 17 | data.columns = dataframe.columns 18 | return data 19 | 20 | def extract(self, dataframe: pd.DataFrame) -> Theory: 21 | self.theory = self._extract(self._substitute_output(dataframe)) 22 | return self.theory 23 | 24 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 25 | raise NotImplementedError('extract') 26 | -------------------------------------------------------------------------------- /test/unit/extraction/hypercubic/gridex/test_gridex.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from test.unit import initialize 4 | import unittest 5 | 6 | 7 | @parameterized_class(initialize('gridex')) 8 | class TestGridEx(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | # This test does not pass the ci, however it is not clear to me why (local ok). Could it be non-deterministic? 14 | # self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | # TODO: check this! 
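            # Unlike the ITER/Trepan/CART tests, which bound the theory-vs-extractor difference by
            # get_default_precision() (1e-6 by default in psyke.utils), this check uses a much
            # looser 0.05 tolerance.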
21 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 0.05) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_trepan.py: -------------------------------------------------------------------------------- 1 | from cmath import isclose 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from psyke.utils import get_default_precision 5 | from psyke.utils.logic import pretty_theory 6 | from test.unit import initialize 7 | import unittest 8 | 9 | 10 | @parameterized_class(initialize('trepan')) 11 | class TestTrepan(unittest.TestCase): 12 | 13 | def test_extract(self): 14 | logger.info(pretty_theory(self.expected_theory) + '\n') 15 | logger.info(pretty_theory(self.extracted_theory) + '\n') 16 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 17 | 18 | def test_predict(self): 19 | if isinstance(self.extracted_test_y_from_theory[0], str): 20 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 21 | else: 22 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 23 | get_default_precision()) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_split.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node, Split 2 | from test import get_dataset 3 | import math 4 | import pandas as pd 5 | import unittest 6 | 7 | 8 | class TestSplit(unittest.TestCase): 9 | 10 | dataset: pd.DataFrame = get_dataset('iris') 11 | n_examples = dataset.shape[0] 12 | all_node = Node(dataset, n_examples) 13 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 14 | setosa_40_complementar = Node(pd.concat([dataset.iloc[:10, :], dataset.iloc[70:, :]]), n_examples) 15 | versicolor_25 = Node(dataset.iloc[40:75, :], n_examples) 16 | versicolor_25_complementar = Node(dataset.iloc[75:110, :], n_examples) 17 | 18 | def test_priority(self): 19 | self.assertTrue(math.isclose(-40/60-50/90-100, 20 | Split(self.all_node, (self.setosa_40, self.setosa_40_complementar)).priority)) 21 | self.assertTrue(math.isclose((25 / 35) * - 2 - 200 + 200, 22 | Split(self.all_node, (self.versicolor_25, self.versicolor_25_complementar)) 23 | .priority)) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /.github/workflows/dockerify.yml: -------------------------------------------------------------------------------- 1 | name: dockerify 2 | on: 3 | workflow_run: 4 | workflows: 5 | - deploy 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: dockerify 14 | RETRY_TIME: 5m 15 | MAX_RETRIES: 3 16 | jobs: 17 | dockerify: 18 | runs-on: ubuntu-latest 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | name: Dockerify with Jupyter support 21 | steps: 22 | - name: Docker Login 23 | run: docker login -u ${{ secrets.DOCKERHUB_USERANAME }} -p ${{ secrets.DOCKERHUB_PASSWORD }} 24 | 25 | - name: Checkout code 26 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 27 | with: 28 | fetch-depth: 0 # all history 29 | submodules: recursive 30 
| 31 | - name: Get All Tags 32 | run: git fetch --tags -f 33 | 34 | - name: Get Version 35 | id: get-version 36 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 37 | 38 | - name: Create Docker Image 39 | run: ./.github/scripts/retry.sh "docker build -t pikalab/psyke:$PSYKE_VERSION --build-arg PSYKE_VERSION=$PSYKE_VERSION ." 40 | shell: bash 41 | env: 42 | PSYKE_VERSION: '${{ steps.get-version.outputs.version }}' 43 | 44 | - name: Push Image on Docker Hub 45 | run: docker push pikalab/psyke:${{ steps.get-version.outputs.version }} 46 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/test_cart.py: -------------------------------------------------------------------------------- 1 | from parameterized import parameterized_class 2 | from psyke.utils import get_default_precision 3 | from psyke import logger 4 | from test.unit import initialize 5 | import unittest 6 | 7 | """ 8 | TODO (?): right now there is a small chance that corner data are wrongly predicted (that is fine for now). 9 | In other words, if we use the extracted rules (with a specific default accuracy fo float) 10 | and compare their result with the one obtained by the actual decision tree (thresholds do not have truncated float) 11 | they may be different. To avoid this, when we will refactor all extractor we will also address this issue. 12 | """ 13 | 14 | 15 | @parameterized_class(initialize('cart')) 16 | class TestCart(unittest.TestCase): 17 | 18 | def test_extract(self): 19 | logger.info(self.expected_theory) 20 | logger.info(self.extracted_theory) 21 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 22 | 23 | def test_predict(self): 24 | # self.assertEqual(self.extracted_test_y_from_theory, self.extracted_test_y_from_pruned_theory) 25 | if isinstance(self.extracted_test_y_from_theory[0], str): 26 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 27 | else: 28 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 29 | get_default_precision()) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /psyke/clustering/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from kneed import KneeLocator 4 | from sklearn.cluster import DBSCAN 5 | from sklearn.mixture import GaussianMixture 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float, int, GaussianMixture]: 10 | components = range(2, max_components + 1) 11 | try: 12 | models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)] 13 | except ValueError: 14 | print(len(data)) 15 | return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)]) 16 | 17 | 18 | def select_dbscan_epsilon(data: pd.DataFrame, clusters: int) -> float: 19 | neighbors = NearestNeighbors(n_neighbors=min(len(data.columns) * 2, len(data))).fit(data) 20 | distances = sorted(np.mean(neighbors.kneighbors(data)[1], axis=1), reverse=True) 21 | try: 22 | kn = KneeLocator([d for d in range(len(distances))], distances, 23 | curve='convex', direction='decreasing', online=True) 24 | if kn.knee is None or kn.knee_y is None: 25 | epsilon = max(distances[-1], 1e-3) 26 | else: 27 | epsilon 
= kn.knee_y 28 | except (RuntimeWarning, UserWarning, ValueError): 29 | epsilon = max(distances[-1], 1e-3) 30 | k = 1. 31 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 32 | # while Counter(dbscan_pred).most_common(1)[0][0] == -1: 33 | for i in range(1000): 34 | if len(np.unique(dbscan_pred)) < clusters + 1: 35 | break 36 | k += .1 37 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 38 | return epsilon * k 39 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import math 3 | import warnings 4 | 5 | warnings.simplefilter("ignore") 6 | 7 | Dimension = tuple[float, float] 8 | Dimensions = dict[str, Dimension] 9 | 10 | 11 | class Expansion: 12 | 13 | def __init__(self, cube, feature: str, direction: str, distance: float = math.nan): 14 | self.cube = cube 15 | self.feature = feature 16 | self.direction = direction 17 | self.distance = distance 18 | 19 | def __getitem__(self, index: int) -> float: 20 | return self.cube[self.feature][index] 21 | 22 | def boundaries(self, a: float, b: float) -> (float, float): 23 | return (self[0], b) if self.direction == '-' else (a, self[1]) 24 | 25 | 26 | class Limit: 27 | 28 | def __init__(self, feature: str, direction: str): 29 | self.feature = feature 30 | self.direction = direction 31 | 32 | def __eq__(self, other): 33 | return (self.feature == other.feature) and (self.direction == other.direction) 34 | 35 | def __hash__(self): 36 | return hash(self.feature + self.direction) 37 | 38 | 39 | class MinUpdate: 40 | 41 | def __init__(self, name: str, value: float): 42 | self.name = name 43 | self.value = value 44 | 45 | 46 | class ZippedDimension: 47 | 48 | def __init__(self, name: str, this_dimension: Dimension, other_dimension: Dimension): 49 | self.name = name 50 | self.this_dimension = this_dimension 51 | self.other_dimension = other_dimension 52 | 53 | def __eq__(self, other: ZippedDimension) -> bool: 54 | return (self.name == other.name) and (self.this_dimension == other.this_dimension) and \ 55 | (self.other_dimension == other.other_dimension) 56 | 57 | -------------------------------------------------------------------------------- /test/unit/extraction/real/test_rule.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from psyke.extraction.real.utils import Rule 3 | from psyke.utils.dataframe import split_features 4 | from test import get_dataset 5 | 6 | 7 | class TestRule(unittest.TestCase): 8 | 9 | def test_subrule(self): 10 | pred_1, pred_2 = ['V1', 'V2'], ['V3', 'V4'] 11 | rule_1 = Rule(pred_1, pred_2) 12 | self.assertTrue(rule_1 in rule_1) 13 | rule_2 = Rule(pred_2, pred_1) 14 | self.assertFalse(rule_1 in rule_2) 15 | self.assertFalse(rule_2 in rule_1) 16 | rule_3 = Rule(['V1'], ['V3']) 17 | self.assertTrue(rule_1 in rule_3) 18 | self.assertFalse(rule_3 in rule_1) 19 | self.assertFalse(rule_2 in rule_3) 20 | self.assertFalse(rule_3 in rule_2) 21 | rule_4 = Rule(["V1"], ["V5"]) 22 | self.assertFalse(rule_1 in rule_4) 23 | self.assertFalse(rule_4 in rule_1) 24 | rule_5 = Rule(["V1", "V6"], ["V3", "V4"]) 25 | self.assertFalse(rule_1 in rule_5) 26 | self.assertFalse(rule_5 in rule_1) 27 | self.assertTrue(rule_1 in Rule([], [])) 28 | 29 | def test_reduce(self): 30 | dataset = get_dataset('iris') 31 | features = split_features(dataset) 32 | rule = Rule(["V1_1", 
"V2_2", "V3_0"], 33 | ["V1_0", "V2_1", "V2_0", "V4_1", "V4_2"]) 34 | reduced_rule = Rule(["V1_1", "V2_2", "V3_0"], 35 | ["V4_1", "V4_2"]) 36 | self.assertEqual(reduced_rule.true_predicates, rule.reduce(features).true_predicates) 37 | self.assertEqual(reduced_rule.false_predicates, rule.reduce(features).false_predicates) 38 | self.assertEqual(reduced_rule.true_predicates, reduced_rule.reduce(features).true_predicates) 39 | self.assertEqual(reduced_rule.false_predicates, reduced_rule.reduce(features).false_predicates) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/cosmik/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.mixture import GaussianMixture 4 | from tuprolog.theory import Theory 5 | 6 | from psyke import Target, Extractor, get_default_random_seed 7 | from psyke.clustering.utils import select_gaussian_mixture 8 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor, RegressionCube 9 | 10 | 11 | class COSMiK(HyperCubeExtractor): 12 | """ 13 | Explanator implementing COSMiK algorithm. 14 | """ 15 | 16 | def __init__(self, predictor, max_components: int = 4, k: int = 5, patience: int = 15, close_to_center: bool = True, 17 | output: Target = Target.CONSTANT, discretization=None, normalization=None, 18 | seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.REGRESSION, discretization, normalization) 20 | self.max = max_components 21 | self.k = k 22 | self.patience = patience 23 | self.output = output 24 | self.close_to_center = close_to_center 25 | self.seed = seed 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | np.random.seed(self.seed) 29 | X, y = dataframe.iloc[:, :-1], dataframe.iloc[:, -1] 30 | 31 | _, n, _ = select_gaussian_mixture(dataframe, self.max) 32 | gmm = GaussianMixture(n) 33 | gmm.fit(X, y) 34 | 35 | divine = Extractor.divine(gmm, self.k, self.patience, self.close_to_center, 36 | self.discretization, self.normalization) 37 | df = X.join(pd.DataFrame(gmm.predict(X))) 38 | df.columns = dataframe.columns 39 | divine.extract(df) 40 | 41 | self._hypercubes = [HyperCube(cube.dimensions.copy()) if self.output == Target.CONSTANT else 42 | RegressionCube(cube.dimensions.copy()) for cube in divine._hypercubes] 43 | for cube in self._hypercubes: 44 | cube.update(dataframe, self.predictor) 45 | 46 | self._sort_cubes() 47 | return self._create_theory(dataframe) -------------------------------------------------------------------------------- /psyke/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from math import log10 3 | from random import Random 4 | 5 | _DEFAULT_RANDOM_SEED: int = 123 6 | 7 | ONNX_EXTENSION: str = '.onnx' 8 | 9 | _random_options = dict(_deterministic_mode=True, _default_random_seed=_DEFAULT_RANDOM_SEED) 10 | 11 | _random_seed_generator: Random = Random(_DEFAULT_RANDOM_SEED) 12 | 13 | _DEFAULT_PRECISION: float = 1e-6 14 | 15 | _precision_options: dict = {'precision': _DEFAULT_PRECISION} 16 | 17 | 18 | class TypeNotAllowedException(Exception): 19 | 20 | def __init__(self, type_name: str): 21 | super().__init__('Type "' + type_name + '" not allowed for discretization.') 22 | 23 | 24 | class Range: 25 | def __init__(self, mean: float, std: float): 26 | self.mean = mean 27 | self.std = std 28 | 
self.lower = mean 29 | self.upper = mean 30 | 31 | def left_infinite(self): 32 | self.lower = float('-inf') 33 | 34 | def right_infinite(self): 35 | self.upper = float('inf') 36 | 37 | def expand_left(self): 38 | self.lower -= self.std 39 | 40 | def expand_right(self): 41 | self.upper += self.std 42 | 43 | 44 | def is_deterministic_mode(): 45 | return _random_options['_deterministic_mode'] 46 | 47 | 48 | def set_deterministic_mode(value: bool): 49 | _random_options['_deterministic_mode'] = value 50 | 51 | 52 | def get_default_random_seed(): 53 | if is_deterministic_mode(): 54 | return _random_options['_default_random_seed'] 55 | else: 56 | return _random_seed_generator.randint(0, 1 << 64) 57 | 58 | 59 | def set_default_random_seed(value: int): 60 | _random_options['_default_random_seed'] = value 61 | 62 | 63 | def get_default_precision() -> float: 64 | return _precision_options['precision'] 65 | 66 | 67 | def get_int_precision() -> int: 68 | return -1 * int(log10(get_default_precision())) 69 | 70 | 71 | def set_default_precision(value: float): 72 | _precision_options['precision'] = value 73 | 74 | 75 | class Target(Enum): 76 | CLASSIFICATION = 1, 77 | CONSTANT = 2, 78 | REGRESSION = 3 79 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/strategy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import reduce 4 | from collections.abc import Iterable 5 | 6 | 7 | class Strategy: 8 | def __init__(self, partitions = None): 9 | self._partitions = partitions 10 | self._no_features = [] 11 | 12 | def get(self, feature: str) -> int: 13 | raise NotImplementedError 14 | 15 | def make_fair(self, features: Iterable[str]): 16 | self._no_features = features 17 | 18 | def partition_number(self, features: Iterable[str]) -> int: 19 | return reduce(lambda x, y: x * y, map(self.get, features), 1) 20 | 21 | def equals(self, strategy, features: Iterable[str]) -> bool: 22 | eq = True 23 | for f in features: 24 | eq = eq and self.get(f) == strategy.get(f) 25 | return eq 26 | 27 | def __str__(self): 28 | return self._partitions 29 | 30 | def __repr__(self): 31 | return self.__str__() 32 | 33 | 34 | class FixedStrategy(Strategy): 35 | def __init__(self, partitions: int = 2): 36 | super().__init__(partitions) 37 | 38 | def get(self, feature: str) -> int: 39 | return 1 if feature in self._no_features else self._partitions 40 | 41 | def __str__(self): 42 | return "Fixed ({})".format(super().__str__()) 43 | 44 | 45 | class AdaptiveStrategy(Strategy): 46 | def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None): 47 | super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)]) 48 | self.features = features 49 | 50 | def get(self, feature: str) -> int: 51 | if feature in self._no_features: 52 | return 1 53 | importance = next(filter(lambda t: t[0] == feature, self.features))[1] 54 | n = 1 55 | for (imp, part) in self._partitions: 56 | if importance >= imp: 57 | n = part 58 | else: 59 | break 60 | return n 61 | 62 | def __str__(self): 63 | return "Adaptive ({})".format(super().__str__()) 64 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/creepy/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterable 4 | from 
typing import Callable, Any 5 | 6 | import pandas as pd 7 | from sklearn.base import ClassifierMixin 8 | from tuprolog.theory import Theory 9 | from psyke import Clustering 10 | from psyke.clustering import HyperCubeClustering 11 | from psyke.extraction.hypercubic import HyperCubeExtractor 12 | from psyke.utils import Target, get_default_random_seed 13 | 14 | 15 | class CReEPy(HyperCubeExtractor): 16 | """ 17 | Explanator implementing CReEPy algorithm. 18 | """ 19 | 20 | ClusteringType = Callable[[int, float, Target, int, Any, Any, int], HyperCubeClustering] 21 | 22 | def __init__(self, predictor, clustering: ClusteringType = Clustering.exact, depth: int = 3, 23 | error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 5, 24 | ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0, discretization=None, 25 | normalization=None, seed: int = get_default_random_seed()): 26 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 27 | discretization, normalization) 28 | self.clustering = clustering(depth, error_threshold, self._output, gauss_components, discretization, 29 | normalization, seed) 30 | self._default_surrounding_cube = True 31 | self._dimensions_to_ignore = set([dimension for dimension, relevance in ranks if relevance < ignore_threshold]) 32 | self._protected_features = [] 33 | 34 | def make_fair(self, features: Iterable[str]): 35 | self.clustering.make_fair(features) 36 | self._dimensions_to_ignore.update(features) 37 | 38 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 39 | if not isinstance(self.clustering, HyperCubeClustering): 40 | raise TypeError("clustering must be a HyperCubeClustering") 41 | 42 | self.clustering.fit(dataframe) 43 | self._hypercubes = self.clustering.get_hypercubes() 44 | self._surrounding = self._hypercubes[-1] 45 | return self._create_theory(dataframe) 46 | -------------------------------------------------------------------------------- /psyke/extraction/real/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from psyke import DiscreteFeature 3 | from typing import Iterable 4 | import pandas as pd 5 | 6 | 7 | class Rule: 8 | 9 | def __init__(self, true_predicates: list[str], false_predicates: list[str]): 10 | self.true_predicates = true_predicates 11 | self.false_predicates = false_predicates 12 | 13 | def __contains__(self, other: Rule) -> bool: 14 | return all([predicate in other.true_predicates for predicate in self.true_predicates]) and\ 15 | all([predicate in other.false_predicates for predicate in self.false_predicates]) 16 | 17 | def __eq__(self, other: Rule) -> bool: 18 | return self.true_predicates == other.true_predicates and self.false_predicates == other.false_predicates 19 | 20 | def __hash__(self) -> int: 21 | return hash(self.true_predicates) + hash(self.false_predicates) 22 | 23 | def reduce(self, features: Iterable[DiscreteFeature]) -> Rule: 24 | to_be_removed = [item for tp in self.true_predicates 25 | for feature in features if tp in feature.admissible_values 26 | for item in feature.admissible_values.keys()] 27 | return Rule(self.true_predicates, [fp for fp in self.false_predicates if fp not in to_be_removed]) 28 | 29 | def to_lists(self) -> list[list[str]]: 30 | return [self.true_predicates.copy(), self.false_predicates.copy()] 31 | 32 | 33 | class IndexedRuleSet(dict[int, list[Rule]]): 34 | 35 | def flatten(self) -> list[tuple[int, Rule]]: 36 | 
return [(key, value) for key, values in self.items() for value in values] 37 | 38 | def optimize(self) -> IndexedRuleSet: 39 | useless_rules = [item for key, entry in self.items() for item in IndexedRuleSet._useless_rules(key, entry)] 40 | for rule in useless_rules: 41 | self[rule[0]].remove(rule[1]) 42 | return self 43 | 44 | @staticmethod 45 | def _useless_rules(key, rules: list[Rule]) -> list[(int, Rule)]: 46 | return [ 47 | (key, rule) for rule in rules 48 | if any(rule in other_rule for other_rule in rules if other_rule != rule) 49 | ] 50 | 51 | @staticmethod 52 | def create_indexed_ruleset(indices: Iterable) -> IndexedRuleSet: 53 | return IndexedRuleSet({i: [] for i in indices}) 54 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | on: 3 | workflow_run: 4 | workflows: 5 | - check 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: depoly 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 18 | name: Deploy on PyPI and create release 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 22 | with: 23 | fetch-depth: 0 # all history 24 | submodules: recursive 25 | 26 | - name: Get All Tags 27 | run: git fetch --tags -f 28 | 29 | - name: Get Python Version 30 | id: get-python-version 31 | run: echo ::set-output name=version::$(cat .python-version) 32 | 33 | - name: Setup Python 34 | uses: actions/setup-python@v6 35 | with: 36 | python-version: ${{ steps.get-python-version.outputs.version }} 37 | 38 | - name: Restore Python dependencies 39 | run: | 40 | pip install -r requirements.txt 41 | 42 | - name: Change default logging level 43 | run: sed -i -e 's/DEBUG/WARN/g' psyke/__init__.py 44 | 45 | - name: Pack 46 | run: python -m build 47 | 48 | - name: Archive Dist Artifacts 49 | if: failure() || success() 50 | uses: actions/upload-artifact@v6 51 | with: 52 | name: dist 53 | path: './dist' 54 | 55 | - name: Upload 56 | run: python -m twine upload dist/* 57 | env: 58 | TWINE_USERNAME: ${{ secrets.PYPI_USERANAME }} 59 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 60 | 61 | - name: Get Version 62 | id: get-version 63 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 64 | 65 | - name: Release Assets 66 | id: upload-release-assets 67 | run: | 68 | set -x 69 | ASSETS=() 70 | for A in dist/*; do 71 | ASSETS+=("-a" "$A") 72 | echo "Releasing $A" 73 | done 74 | RELEASE_TAG='${{ steps.get-version.outputs.version }}' 75 | gh release create "$RELEASE_TAG" 76 | env: 77 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 78 | -------------------------------------------------------------------------------- /test/unit/utils/test_prune.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import prune 5 | 6 | 7 | class TestPrune(unittest.TestCase): 8 | 9 | def test_prune_documentation(self): 10 | theory1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 11 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 12 | pruned1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 
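        # The first clause of theory1 merely adds '=='(C, 0) to the constraints of the second clause,
        # so it covers a subset of the second clause's cases and prune() is expected to drop it.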
13 | 14 | theory2 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 15 | + "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 16 | pruned2 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 17 | 18 | theory3 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8)). " \ 19 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0))." 20 | pruned3 = pruned2 21 | 22 | self.assertTrue(theory(parse_theory(pruned1)).equals(prune(mutable_theory(parse_theory(theory1))), False)) 23 | self.assertTrue(theory(parse_theory(pruned2)).equals(prune(mutable_theory(parse_theory(theory2))), False)) 24 | self.assertTrue(theory(parse_theory(pruned3)).equals(prune(mutable_theory(parse_theory(theory3))), False)) 25 | 26 | def test_prune_success(self): 27 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2)). " \ 28 | + "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 3))." 29 | textual_pruned_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2))." 30 | long_theory = mutable_theory(parse_theory(textual_theory)) 31 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 32 | 33 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 34 | 35 | def test_prune_not_applied(self): 36 | textual_theory = "p(PL, PW, SL, SW, versicolor) :- '=<'(SW, 3.6). " \ 37 | + "p(PL, PW, SL, SW, versicolor) :- ('=<'(PW, 0.35), '=<'(SL, 5.35), '=<'(SW, 3.9))." 38 | textual_pruned_theory = textual_theory 39 | long_theory = mutable_theory(parse_theory(textual_theory)) 40 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 41 | 42 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /psyke/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score 4 | 5 | 6 | def mae(expected, predicted): 7 | """ 8 | Calculates the predictions' MAE w.r.t. the instances given as input. 9 | 10 | :param expected: the expected data . 11 | :param predicted: the predicted data. 12 | :return: the mean absolute error (MAE) of the predictions. 13 | """ 14 | return score(expected, predicted, mean_absolute_error) 15 | 16 | 17 | def mse(expected, predicted): 18 | """ 19 | Calculates the predictions' MSE w.r.t. the instances given as input. 20 | 21 | :param expected: the expected data . 22 | :param predicted: the predicted data. 23 | :return: the mean squared error (MSE) of the predictions. 24 | """ 25 | return score(expected, predicted, mean_squared_error) 26 | 27 | 28 | def r2(expected, predicted): 29 | """ 30 | Calculates the predictions' R2 w.r.t. the instances given as input. 31 | 32 | :param expected: the expected data . 33 | :param predicted: the predicted data. 34 | :return: the R2 score of the predictions. 35 | """ 36 | return score(expected, predicted, r2_score) 37 | 38 | 39 | def accuracy(expected, predicted): 40 | """ 41 | Calculates the predictions' classification accuracy w.r.t. the instances given as input. 42 | 43 | :param expected: the expected data . 44 | :param predicted: the predicted data. 45 | :return: the classification accuracy of the predictions. 46 | """ 47 | return score(expected, predicted, accuracy_score) 48 | 49 | 50 | def f1(expected, predicted): 51 | """ 52 | Calculates the predictions' F1 score w.r.t. 
the instances given as input. 53 | 54 | :param expected: the expected data . 55 | :param predicted: the predicted data. 56 | :return: the F1 score of the predictions. 57 | """ 58 | return score(expected, predicted, partial(f1_score, average='weighted')) 59 | 60 | 61 | def score(expected, predicted, scoring_function): 62 | """ 63 | Calculates the predictions' score w.r.t. the instances given as input with the provided scoring function. 64 | 65 | :param expected: the expected data . 66 | :param predicted: the predicted data. 67 | :param scoring_function: the scoring function to be used. 68 | :return: the score of the predictions. 69 | """ 70 | idx = [prediction is not None for prediction in predicted] 71 | return scoring_function(expected[idx], predicted[idx]) 72 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTreePredictor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Union, Any 3 | 4 | from psyke.extraction.cart import FairTreeClassifier, FairTreeRegressor, LeafSequence, LeafConstraints 5 | from psyke.extraction.cart.CartPredictor import CartPredictor 6 | from psyke.schema import LessThan, GreaterThan, SchemaException, Value 7 | 8 | 9 | class FairTreePredictor(CartPredictor): 10 | """ 11 | A wrapper for fair decision and regression trees of psyke. 12 | """ 13 | 14 | def __init__(self, predictor: Union[FairTreeClassifier, FairTreeRegressor] = FairTreeClassifier(), 15 | discretization=None, normalization=None): 16 | super().__init__(predictor, discretization, normalization) 17 | 18 | def __iter__(self) -> LeafSequence: 19 | leaves = [node for node in self.recurse(self._predictor.root, {})] 20 | return (leaf for leaf in leaves) 21 | 22 | @staticmethod 23 | def merge_constraints(constraints: LeafConstraints, constraint: Value, feature: str): 24 | if feature in constraints: 25 | try: 26 | constraints[feature][-1] *= constraint 27 | except SchemaException: 28 | constraints[feature].append(constraint) 29 | else: 30 | constraints[feature] = [constraint] 31 | return constraints 32 | 33 | def recurse(self, node, constraints) -> Union[LeafSequence, tuple[LeafConstraints, Any]]: 34 | if node.is_leaf_node(): 35 | return constraints, node.value 36 | 37 | feature = node.feature 38 | threshold = node.threshold if self.normalization is None else \ 39 | (node.threshold * self.normalization[feature][1] + self.normalization[feature][0]) 40 | 41 | left = self.recurse(node.left, self.merge_constraints(copy.deepcopy(constraints), LessThan(threshold), feature)) 42 | right = self.recurse(node.right, self.merge_constraints(copy.deepcopy(constraints), 43 | GreaterThan(threshold), feature)) 44 | return (left if isinstance(left, list) else [left]) + (right if isinstance(right, list) else [right]) 45 | 46 | @property 47 | def predictor(self) -> Union[FairTreeClassifier, FairTreeRegressor]: 48 | return self._predictor 49 | 50 | @property 51 | def n_leaves(self) -> int: 52 | return self._predictor.n_leaves 53 | 54 | @predictor.setter 55 | def predictor(self, predictor: Union[FairTreeClassifier, FairTreeRegressor]): 56 | self._predictor = predictor 57 | -------------------------------------------------------------------------------- /test/unit/extraction/cart/test_simplified_cart.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | from parameterized import parameterized_class 5 | from 
sklearn.model_selection import train_test_split 6 | from tuprolog.solve.prolog import prolog_solver 7 | from tuprolog.theory import mutable_theory 8 | 9 | from psyke import Extractor 10 | from psyke.utils import get_default_precision 11 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 12 | from test import get_dataset, get_model 13 | import unittest 14 | 15 | 16 | # TODO: should be refactored using the a .csv file 17 | from test.unit import get_substitutions 18 | 19 | 20 | @parameterized_class([{"dataset": "iris", "predictor": "DTC", "task": "extraction"}, 21 | {"dataset": "house", "predictor": "DTR", "task": "hypercubic"}]) 22 | class TestSimplifiedCart(unittest.TestCase): 23 | 24 | def test_equality(self): 25 | dataset = get_dataset(self.dataset) 26 | dataset = dataset.reindex(sorted(dataset.columns[:-1]) + [dataset.columns[-1]], axis=1) 27 | train, test = train_test_split(dataset, test_size=0.5) 28 | tree, _ = get_model(self.predictor, {}) 29 | tree.fit(train.iloc[:, :-1], train.iloc[:, -1]) 30 | extractor = Extractor.cart(tree, simplify=False) 31 | theory = extractor.extract(train) 32 | simplified_extractor = Extractor.cart(tree) 33 | simplified_theory = simplified_extractor.extract(train) 34 | 35 | index = test.shape[1] - 1 36 | cast, substitutions = get_substitutions(test, theory) 37 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions] 38 | 39 | cast, simplified_substitutions = get_substitutions(test, simplified_theory) 40 | simplified_expected = [cast(query.solved_query.get_arg_at(index)) for query in simplified_substitutions] 41 | 42 | if isinstance(test.iloc[0, -1], str): 43 | self.assertTrue(all(np.array(extractor.predict(test.iloc[:, :-1])) == 44 | np.array(simplified_extractor.predict(test.iloc[:, :-1])))) 45 | self.assertEqual(expected, simplified_expected) 46 | else: 47 | self.assertTrue(max(abs(np.array(extractor.predict(test.iloc[:, :-1])) - 48 | np.array(simplified_extractor.predict(test.iloc[:, :-1]))) 49 | ) < get_default_precision()) 50 | self.assertTrue(max(abs(np.array(expected) - np.array(simplified_expected))) < get_default_precision()) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import chain 3 | from typing import Iterable, Any 4 | import pandas as pd 5 | 6 | 7 | class Node: 8 | 9 | def __init__(self, samples: pd.DataFrame, n_examples: int, constraints: Iterable[tuple[str, float]] = None, 10 | children: list[Node] = None, depth: int = 0): 11 | self.samples = samples 12 | self.n_examples = n_examples 13 | self.constraints = [] if constraints is None else constraints 14 | self.children = [] if children is None else children 15 | self.depth = depth 16 | 17 | def __str__(self): 18 | name = ''.join(('' if c[1] > 0 else '!') + c[0] + ', ' for c in self.constraints) 19 | return name[:-2] + ' = ' + str(self.dominant) 20 | 21 | @property 22 | def priority(self) -> float: 23 | return -(self.reach * (1 - self.fidelity)) 24 | 25 | @property 26 | def fidelity(self) -> float: 27 | return 1.0 * self.correct / (self.samples.shape[0] if self.samples.shape[0] > 0 else 1) 28 | 29 | @property 30 | def reach(self) -> float: 31 | return 1.0 * self.samples.shape[0] / self.n_examples 32 | 33 | @property 34 | def correct(self) -> float: 35 | return 
sum(self.samples.iloc[:, -1] == self.dominant) 36 | 37 | @property 38 | def dominant(self) -> Any: 39 | return self.samples.iloc[:, -1].mode()[0] if self.samples.shape[0] > 0 else '' 40 | 41 | @property 42 | def n_classes(self) -> int: 43 | return len(set(self.samples.iloc[:, -1])) 44 | 45 | def __iter__(self) -> Iterable[Node]: 46 | for child in chain(*map(iter, self.children)): 47 | yield child 48 | yield self 49 | 50 | 51 | class Split: 52 | 53 | # TODO: should be configurable by user 54 | PRIORITY_BONUS: int = 100 55 | PRIORITY_PENALTY: int = 200 56 | 57 | def __init__(self, parent: Node, children: tuple[Node, Node]): 58 | self.parent = parent 59 | self.children = children 60 | 61 | @property 62 | def priority(self) -> float: 63 | return self.__priority(self.parent) 64 | 65 | def __priority(self, parent: Node) -> float: 66 | true_node, false_node = self.children 67 | priority = - (true_node.fidelity + false_node.fidelity) 68 | for node in [true_node, false_node]: 69 | priority -= self.PRIORITY_BONUS if parent.n_classes > node.n_classes else 0 70 | priority += self.PRIORITY_PENALTY if true_node.dominant == false_node.dominant else 0 71 | return priority 72 | 73 | 74 | class SplitLogic: 75 | 76 | DEFAULT = 1 77 | -------------------------------------------------------------------------------- /psyke/tuning/crash/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import pandas as pd 4 | 5 | from psyke.tuning import Objective, SKEOptimizer 6 | from psyke.tuning.orchid import OrCHiD 7 | from psyke.utils import Target 8 | 9 | 10 | class CRASH(SKEOptimizer): 11 | class Algorithm(Enum): 12 | ExACT = 1, 13 | CREAM = 2 14 | 15 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 16 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 17 | max_gauss_components: int = 5, patience: int = 5, output: Target = Target.CONSTANT, 18 | objective: Objective = Objective.MODEL, normalization=None, discretization=None): 19 | super().__init__(predictor, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 20 | patience, objective, output, normalization, discretization) 21 | self.max_depth = max_depth 22 | self.max_gauss_components = max_gauss_components 23 | 24 | def search(self): 25 | self.params = [] 26 | for algorithm in [OrCHiD.Algorithm.ExACT, OrCHiD.Algorithm.CREAM]: 27 | self.params += self.__search_algorithm(algorithm) 28 | 29 | def __search_algorithm(self, algorithm): 30 | params = [] 31 | best = None 32 | 33 | for gauss_components in range(2, self.max_gauss_components + 1): 34 | data = self.dataframe.sample(n=gauss_components * 100) if gauss_components * 100 < len(self.dataframe) \ 35 | else self.dataframe 36 | current_params = self.__search_components(data, algorithm, gauss_components) 37 | current_best = self._best(current_params)[1] 38 | if best is not None and self._score(best) <= self._score(current_best): 39 | break 40 | best = current_best 41 | params += current_params 42 | 43 | return params 44 | 45 | def __search_components(self, data, algorithm, gauss_components): 46 | orchid = OrCHiD(data, algorithm, self.output, self.max_error_increase, self.min_rule_decrease, 47 | self.readability_tradeoff, self.patience, self.max_depth, gauss_components, 48 | self.normalization, self.discretization) 49 | orchid.search() 50 | return [(*p, gauss_components, algorithm) for p in orchid.params] 51 | 52 | def _print_params(self, name, 
params): 53 | print("*****************************") 54 | print(f"Best {name}") 55 | print("*****************************") 56 | print(f"MAE = {params[0]:.2f}, {params[1]} rules") 57 | print(f"Algorithm = {params[5]}") 58 | print(f"Threshold = {params[3]:.2f}") 59 | print(f"Depth = {params[2]}") 60 | print(f"Gaussian components = {params[4]}") 61 | -------------------------------------------------------------------------------- /psyke/clustering/cream/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from psyke.utils import Target, get_default_random_seed 9 | from psyke.clustering.exact import ExACT 10 | from psyke.extraction.hypercubic import Node, HyperCube, ClosedCube 11 | from psyke.clustering.utils import select_gaussian_mixture 12 | 13 | 14 | class CREAM(ExACT): 15 | """ 16 | Explanator implementing CREAM algorithm. 17 | """ 18 | 19 | def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5, 20 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 21 | super().__init__(depth, error_threshold, output, gauss_components, discretization, normalization, seed) 22 | 23 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 24 | cubes = [] 25 | for i in range(len(np.unique(gauss_pred))): 26 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 27 | if len(df) == 0: 28 | continue 29 | inner_cube = self._create_cube(df, clusters) 30 | indices = self._indices(inner_cube, node.dataframe) 31 | if indices is None: 32 | continue 33 | right, left = self._split(inner_cube, node.cube, node.dataframe, indices) 34 | cubes.append(( 35 | ((right.diversity + left.diversity) / 2, right.volume(), left.volume(), i), 36 | (right, indices), (left, ~indices) 37 | )) 38 | return cubes 39 | 40 | def _split(self, right: ClosedCube, outer_cube: ClosedCube, data: pd.DataFrame, indices: np.ndarray): 41 | right.update(data.iloc[indices], self._predictor) 42 | left = outer_cube.copy() 43 | left.update(data.iloc[~indices], self._predictor) 44 | return right, left 45 | 46 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 47 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 48 | while len(to_split) > 0: 49 | node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split) 50 | cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 51 | if len(cubes) < 1: 52 | continue 53 | _, right, left = min(cubes) 54 | # find_better_constraints(node.dataframe[right[1]], right[0]) 55 | node.right = Node(node.dataframe[right[1]], right[0]) 56 | node.cube.update(node.dataframe[left[1]], self._predictor) 57 | node.left = Node(node.dataframe[left[1]], left[0]) 58 | 59 | if depth < self.depth: 60 | to_split += [ 61 | (error, depth + 1, np.random.uniform(), n) for (n, error) in 62 | zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold 63 | ] 64 | return self._node_to_cubes(surrounding) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | VERSION 4 | 5 | .idea/ 6 | .vscode/ 7 | 8 | *~ 9 | *.jar 10 | 
11 | ### Python ### 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | pytestdebug.log 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | doc/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | pythonenv* 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ 144 | 145 | # pytype static type analyzer 146 | .pytype/ 147 | 148 | # profiling data 149 | .prof 150 | 151 | # End of https://www.toptal.com/developers/gitignore/api/python 152 | 153 | # macOS stuff 154 | .DS_store 155 | 156 | # File ONNX 157 | *.onnx 158 | 159 | # Local stuff 160 | dummy/ 161 | tmp_model/ 162 | plots/ 163 | demo/ 164 | -------------------------------------------------------------------------------- /psyke/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 4 | 5 | from psyke.extraction import PedagogicalExtractor 6 | from psyke import get_default_random_seed 7 | from psyke.extraction.cart.FairTree import FairTreeClassifier, FairTreeRegressor 8 | from psyke.schema import DiscreteFeature, Value 9 | from tuprolog.theory import Theory 10 | from typing import Iterable, Any 11 | import pandas as pd 12 | 13 | 14 | TREE_SEED = get_default_random_seed() 15 | 16 | LeafConstraints = dict[str, list[Value]] 17 | LeafSequence = Iterable[tuple[LeafConstraints, Any]] 18 | 19 | 20 | class Cart(PedagogicalExtractor, ABC): 21 | 22 | def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None, 23 | discretization: Iterable[DiscreteFeature] = None, 24 | normalization=None, simplify: bool = True): 25 | from psyke.extraction.cart.CartPredictor import CartPredictor 26 | 27 | super().__init__(predictor, discretization, normalization) 28 | self.is_fair = None 29 | self._cart_predictor = CartPredictor(discretization=discretization, normalization=normalization) 30 | self.depth = max_depth 31 | self.leaves = max_leaves 32 | self.max_features = max_features 33 | self._simplify = simplify 34 | 35 | def _extract(self, data: pd.DataFrame) -> Theory: 36 | from psyke.extraction.cart.FairTreePredictor import FairTreePredictor 37 | 38 | if self.is_fair: 39 | self._cart_predictor = FairTreePredictor(discretization=self.discretization, 40 | normalization=self.normalization) 41 | fair_tree = FairTreeClassifier if isinstance(data.iloc[0, -1], str) else FairTreeRegressor 42 | self._cart_predictor.predictor = fair_tree(max_depth=self.depth, max_leaves=self.leaves, 43 | protected_attr=self.is_fair) 44 | else: 45 | tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor 46 | self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth, 47 | max_leaf_nodes=self.leaves, max_features=self.max_features) 48 | self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1]) 49 | return self._cart_predictor.create_theory(data, self._simplify) 50 | 51 | def make_fair(self, features: Iterable[str]): 52 | self.is_fair = features 53 | 54 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 55 | return self._cart_predictor.predict(dataframe) 56 | 57 | def 
predict_why(self, data: dict[str, float], verbose=True): 58 | prediction = None 59 | conditions = {} 60 | if self.normalization is not None: 61 | data = {k: v * self.normalization[k][1] + self.normalization[k][0] if k in self.normalization else v 62 | for k, v in data.items()} 63 | for conditions, prediction in self._cart_predictor: 64 | if all(all(interval.is_in(data[variable]) for interval in intervals) 65 | for variable, intervals in conditions.items()): 66 | break 67 | return prediction, conditions 68 | 69 | @property 70 | def n_rules(self) -> int: 71 | return self._cart_predictor.n_leaves 72 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import product 3 | from typing import Iterable 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from tuprolog.theory import Theory 8 | from psyke import get_default_random_seed 9 | from psyke.utils import Target 10 | from psyke.extraction.hypercubic import HyperCubeExtractor, Grid, HyperCube 11 | 12 | 13 | class GridEx(HyperCubeExtractor): 14 | """ 15 | Explanator implementing GridEx algorithm, doi:10.1007/978-3-030-82017-6_2. 16 | """ 17 | 18 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 19 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 20 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 21 | discretization, normalization) 22 | self.grid = grid 23 | self.min_examples = min_examples 24 | self.threshold = threshold 25 | np.random.seed(seed) 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | self._hypercubes = [] 29 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 30 | self._surrounding.init_diversity(2 * self.threshold) 31 | self._iterate(dataframe) 32 | return self._create_theory(dataframe) 33 | 34 | def _create_ranges(self, cube, iteration): 35 | ranges = {} 36 | for (feature, (a, b)) in cube.dimensions.items(): 37 | n_bins = self.grid.get(feature, iteration) 38 | if n_bins == 1: 39 | ranges[feature] = [(a, b)] 40 | self._dimensions_to_ignore.add(feature) 41 | else: 42 | size = (b - a) / n_bins 43 | ranges[feature] = [(a + size * i, a + size * (i + 1)) for i in range(n_bins)] 44 | return ranges 45 | 46 | def _cubes_to_split(self, cube, iteration, dataframe, fake, keep_empty=False): 47 | to_split = [] 48 | for p in product(*self._create_ranges(cube, iteration).values()): 49 | cube = self._default_cube() 50 | for i, f in enumerate(dataframe.columns[:-1]): 51 | cube.update_dimension(f, p[i]) 52 | n = cube.count(dataframe) 53 | if n > 0 or keep_empty: 54 | fake = pd.concat([fake, cube.create_samples(self.min_examples - n)]) 55 | cube.update(fake, self.predictor) 56 | to_split.append(cube) 57 | return to_split, fake 58 | 59 | def _iterate(self, dataframe: pd.DataFrame): 60 | fake = dataframe.copy() 61 | prev = [self._surrounding] 62 | 63 | for iteration in self.grid.iterate(): 64 | next_iteration = [] 65 | for cube in prev: 66 | if cube.count(dataframe) == 0: 67 | continue 68 | if cube.diversity < self.threshold: 69 | self._hypercubes.append(cube) 70 | continue 71 | to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake) 72 | 
next_iteration.extend(self._merge(to_split, fake)) 73 | prev = next_iteration 74 | self._hypercubes.extend(prev) 75 | 76 | def make_fair(self, features: Iterable[str]): 77 | self.grid.make_fair(features) 78 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: check 2 | on: 3 | push: 4 | tags: '*' 5 | branches-ignore: 6 | - 'autodelivery**' 7 | - 'bump-**' 8 | - 'dependabot/**' 9 | paths-ignore: 10 | - 'CHANGELOG.md' 11 | - 'renovate.json' 12 | - '.gitignore' 13 | pull_request: 14 | workflow_dispatch: 15 | env: 16 | PROJECT_NAME: psyke-python 17 | WORKFLOW: check 18 | TEST_SUBMODULE: psykei/psyke-pytest 19 | jobs: 20 | create-test-predictors-if-needed: 21 | runs-on: ubuntu-latest 22 | name: Create test predictors if needed 23 | # TODO: short circuit job as soon as it's possible: 24 | # https://github.com/actions/runner/issues/662 25 | # if: ${{ github.repository == 'psykei/psyke-python' }} 26 | steps: 27 | - name: Checkout code 28 | if: ${{ github.repository == 'psykei/psyke-python' }} 29 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 30 | with: 31 | fetch-depth: 0 32 | submodules: recursive 33 | 34 | - name: Get Python Version 35 | if: ${{ github.repository == 'psykei/psyke-python' }} 36 | id: get-python-version 37 | run: echo ::set-output name=version::$(cat .python-version) 38 | 39 | - name: Setup Python 40 | if: ${{ github.repository == 'psykei/psyke-python' }} 41 | uses: actions/setup-python@v6 42 | with: 43 | python-version: ${{ steps.get-python-version.outputs.version }} 44 | 45 | - name: Restore Python dependencies 46 | if: ${{ github.repository == 'psykei/psyke-python' }} 47 | run: pip install -r requirements.txt 48 | 49 | # - name: Create missing predictors 50 | # if: ${{ github.repository == 'psykei/psyke-python' }} 51 | # run: python setup.py create_test_predictors 52 | 53 | - name: Submodule update 54 | if: ${{ github.repository == 'psykei/psyke-python' }} 55 | run: | 56 | pushd test/resources 57 | git config user.email "bot@noreply.github.com" 58 | git config user.name "CI bot" 59 | git remote set-url origin https://x-access-token:${{ secrets.TRIGGER_GITHUB_ACTION }}@github.com/${{ env.TEST_SUBMODULE }} 60 | (git add predictors/*.onnx tests/*.csv datasets/*.csv) || echo 'nothing to add' 61 | (git commit -m 'predictors update from workflows') || echo 'nothing to commit' 62 | (git push) || echo 'nothing to push' 63 | run-unit-tests: 64 | strategy: 65 | fail-fast: false 66 | matrix: 67 | os: 68 | - ubuntu-latest 69 | - windows-latest 70 | - macos-latest 71 | python-version: 72 | - '3.11.0' 73 | runs-on: ${{ matrix.os }} 74 | name: Run tests on Python ${{ matrix.python-version }}, on ${{ matrix.os }} 75 | timeout-minutes: 45 76 | concurrency: 77 | group: ${{ github.workflow }}-run-unit-tests-${{ matrix.python-version }}-${{ matrix.os }}-${{ github.event.number || github.ref }} 78 | cancel-in-progress: true 79 | needs: 80 | - create-test-predictors-if-needed 81 | steps: 82 | - name: Setup Python 83 | uses: actions/setup-python@v6 84 | with: 85 | python-version: ${{ matrix.python-version }} 86 | 87 | - name: Checkout code 88 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 89 | with: 90 | fetch-depth: 0 91 | submodules: recursive 92 | 93 | - name: Restore Python dependencies 94 | run: pip install -r requirements.txt 95 | 96 | - name: Test 97 | run: python -m unittest discover -s test -t . 
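# The same check can be reproduced locally, assuming Python 3.11 and the test/resources
# submodule are available; these two commands simply mirror the workflow steps above:
#   pip install -r requirements.txt
#   python -m unittest discover -s test -t .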
98 | -------------------------------------------------------------------------------- /test/unit/extraction/trepan/test_node.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node 2 | from test import get_dataset 3 | import pandas as pd 4 | import unittest 5 | 6 | 7 | class TestNode(unittest.TestCase): 8 | 9 | dataset: pd.DataFrame = get_dataset('iris') 10 | n_examples = dataset.shape[0] 11 | all_node = Node(dataset, n_examples) 12 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 13 | virginica_10 = Node(dataset.iloc[95:110, :], n_examples) 14 | versicolor_50 = Node(dataset.iloc[20:130, :], n_examples) 15 | 16 | def test_reach(self): 17 | node = Node(self.dataset, self.n_examples) 18 | self.assertEqual(node.reach, self.all_node.reach) 19 | self.assertTrue(self.virginica_10.reach < self.setosa_40.reach) 20 | self.assertTrue(self.setosa_40.reach < self.versicolor_50.reach) 21 | self.assertTrue(self.versicolor_50.reach < self.all_node.reach) 22 | 23 | def test_dominant(self): 24 | self.assertEqual('setosa', self.setosa_40.dominant) 25 | self.assertEqual('virginica', self.virginica_10.dominant) 26 | self.assertEqual('versicolor', self.versicolor_50.dominant) 27 | 28 | def test_correct(self): 29 | self.assertEqual(50, self.versicolor_50.correct) 30 | self.assertEqual(40, self.setosa_40.correct) 31 | self.assertEqual(10, self.virginica_10.correct) 32 | 33 | def test_fidelity(self): 34 | self.assertEqual(50 / 150, self.all_node.fidelity) 35 | self.assertEqual(40 / 60, self.setosa_40.fidelity) 36 | self.assertEqual(10 / 15, self.virginica_10.fidelity) 37 | self.assertEqual(50 / 110, self.versicolor_50.fidelity) 38 | 39 | def test_priority(self): 40 | self.assertTrue(self.all_node.priority < self.versicolor_50.priority) 41 | self.assertTrue(self.versicolor_50.priority < self.setosa_40.priority) 42 | self.assertTrue(self.setosa_40.priority < self.virginica_10.priority) 43 | 44 | def test_n_classes(self): 45 | self.assertEqual(3, self.all_node.n_classes) 46 | self.assertEqual(2, self.virginica_10.n_classes) 47 | self.assertEqual(2, self.setosa_40.n_classes) 48 | self.assertEqual(3, self.versicolor_50.n_classes) 49 | self.assertEqual(1, Node(self.dataset.iloc[15:40, :], self.n_examples).n_classes) 50 | 51 | def test_iterator(self): 52 | node = Node(self.dataset, self.n_examples) 53 | child_1 = Node(self.dataset.iloc[:50, :], self.n_examples) 54 | child_2 = Node(self.dataset.iloc[50:150, :], self.n_examples) 55 | node.children = [child_1, child_2] 56 | grandchild_1_1 = Node(self.dataset.iloc[:25, :], self.n_examples) 57 | grandchild_2_1 = Node(self.dataset.iloc[50:80, :], self.n_examples) 58 | grandchild_2_2 = Node(self.dataset.iloc[80:120, :], self.n_examples) 59 | child_1.children = [grandchild_1_1] 60 | child_2.children = [grandchild_2_1, grandchild_2_2] 61 | self.assertEqual(list(node), list(child_1) + list(child_2) + [node]) 62 | self.assertEqual([grandchild_1_1, child_1, grandchild_2_1, grandchild_2_2, child_2, node], list(node)) 63 | 64 | def test_to_string(self): 65 | node = Node(self.dataset, self.n_examples, (('V1', 0.0), ('V2', 1.0))) 66 | self.assertEqual(' = setosa', str(self.all_node)) 67 | self.assertEqual(' = setosa', str(self.setosa_40)) 68 | self.assertEqual(' = versicolor', str(self.versicolor_50)) 69 | self.assertEqual(' = virginica', str(self.virginica_10)) 70 | self.assertEqual('!V1, V2 = setosa', str(node)) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | 
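# For reference, the ordering asserted in test_priority follows directly from the Node
# properties defined in psyke/extraction/trepan/utils.py: reach = |samples| / n_examples,
# fidelity = correct / |samples|, and priority = -(reach * (1 - fidelity)).
# A minimal standalone sketch, plugging in the reach/fidelity values already asserted above:
#
#     def priority(reach, fidelity):
#         return -(reach * (1 - fidelity))
#
#     priority(150 / 150, 50 / 150)   # all_node      ~ -0.667
#     priority(110 / 150, 50 / 110)   # versicolor_50 ~ -0.400
#     priority(60 / 150, 40 / 60)     # setosa_40     ~ -0.133
#     priority(15 / 150, 10 / 15)     # virginica_10  ~ -0.033
#
# which reproduces all_node < versicolor_50 < setosa_40 < virginica_10.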
-------------------------------------------------------------------------------- /psyke/tuning/orchid/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from psyke import Clustering, EvaluableModel 7 | from psyke.tuning import Optimizer, IterativeOptimizer 8 | from psyke.utils import Target 9 | 10 | 11 | class OrCHiD(IterativeOptimizer): 12 | class Algorithm(Enum): 13 | ExACT = 1, 14 | CREAM = 2 15 | 16 | def __init__(self, dataframe: pd.DataFrame, algorithm, output: Target = Target.CONSTANT, 17 | max_error_increase: float = 1.2, min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, 18 | patience: int = 5, max_depth: int = 10, gauss_components=10, normalization=None, discretization=None): 19 | super().__init__(dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, max_depth, patience, 20 | output, normalization, discretization) 21 | self.algorithm = algorithm 22 | self.gauss_components = gauss_components 23 | 24 | def search(self): 25 | self.params = self.__search_depth() 26 | 27 | def __search_depth(self): 28 | params, best = [], None 29 | 30 | for depth in range(1, self.max_depth + 1): 31 | current_params = self.__search_threshold(depth) 32 | current_best = self._best(current_params)[1] 33 | print() 34 | best, to_break = self._check_iteration_improvement(best, current_best) 35 | params += current_params 36 | 37 | if len(params) > 1 and to_break: 38 | break 39 | return params 40 | 41 | def __search_threshold(self, depth): 42 | step = 1.0 43 | threshold = 1.0 44 | params = [] 45 | patience = self.patience 46 | while patience > 0: 47 | print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. " 48 | f"Gaussian components = {self.gauss_components}. 
", end="") 49 | clustering = (Clustering.cream if self.algorithm == OrCHiD.Algorithm.CREAM else Clustering.exact)( 50 | depth=depth, error_threshold=threshold, gauss_components=self.gauss_components, output=self.output 51 | ) 52 | clustering.fit(self.dataframe) 53 | task, metric = \ 54 | (EvaluableModel.Task.CLASSIFICATION, EvaluableModel.ClassificationScore.INVERSE_ACCURACY) \ 55 | if self.output == Target.CLASSIFICATION else \ 56 | (EvaluableModel.Task.REGRESSION, EvaluableModel.RegressionScore.MAE) 57 | p, n = clustering.score(self.dataframe, None, False, False, task=task, 58 | scoring_function=[metric])[metric][0], clustering.n_rules 59 | 60 | print(f"Predictive loss = {p:.2f}, {n} rules") 61 | 62 | if len(params) == 0: 63 | params.append((p, n, depth, threshold)) 64 | threshold = p / 20 65 | step = p / self.patience * 0.75 66 | continue 67 | 68 | if (n == 1) or (p == 0.0): 69 | params.append((p, n, depth, threshold)) 70 | break 71 | 72 | if p > params[0][0] * self.max_error_increase: 73 | break 74 | 75 | improvement = (params[-1][0] / p) + (1 - n / params[-1][1]) 76 | 77 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 78 | patience -= 1 79 | if p != params[-1][0] or n != params[-1][1]: 80 | params.append((p, n, depth, threshold)) 81 | threshold += step 82 | return params 83 | 84 | def _print_params(self, name, params): 85 | print("*" * 40) 86 | print(f"* Best {name}") 87 | print("*" * 40) 88 | print(f"* Predictive loss = {params[0]:.2f}, {params[1]} rules") 89 | print(f"* Threshold = {params[3]:.2f}") 90 | print(f"* Depth = {params[2]}") 91 | print("*" * 40) 92 | -------------------------------------------------------------------------------- /psyke/tuning/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from enum import Enum 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from psyke.extraction.hypercubic import Grid 7 | from psyke.utils import Target 8 | 9 | 10 | class Objective(Enum): 11 | MODEL = 1, 12 | DATA = 2 13 | 14 | 15 | class Optimizer: 16 | def __init__(self, dataframe: pd.DataFrame, output: Target = Target.CONSTANT, max_error_increase: float = 1.2, 17 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 18 | normalization=None, discretization=None): 19 | self.dataframe = dataframe 20 | self.output = output 21 | self.max_error_increase = max_error_increase 22 | self.min_rule_decrease = min_rule_decrease 23 | self.readability_tradeoff = readability_tradeoff 24 | self.patience = patience 25 | self.params = None 26 | self.normalization = normalization 27 | self.discretization = discretization 28 | 29 | def search(self): 30 | raise NotImplementedError 31 | 32 | def _best(self, params): 33 | param_dict = {self._score(t): t for t in params} 34 | min_param = min(param_dict) 35 | return min_param, param_dict[min_param] 36 | 37 | def _score(self, param): 38 | return param[0] * np.ceil(param[1] * self.readability_tradeoff) 39 | 40 | def _best_param(self, param): 41 | param_dict = {t[param]: t for t in self.params} 42 | min_param = min(param_dict) 43 | return min_param, param_dict[min_param] 44 | 45 | def get_best(self): 46 | names = ["Combined", "Predictive loss", "N rules"] 47 | params = [self._best(self.params), self._best_param(0), self._best_param(1)] 48 | for n, p in zip(names, params): 49 | self._print_params(n, p[1]) 50 | print() 51 | return self._best(self.params)[1], self._best_param(0)[1], self._best_param(1)[1] 52 | 53 | def 
_print_params(self, n, param): 54 | raise NotImplementedError 55 | 56 | 57 | class SKEOptimizer(Optimizer, ABC): 58 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 59 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 60 | objective: Objective = Objective.MODEL, output: Target = Target.CONSTANT, 61 | normalization=None, discretization=None): 62 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 63 | patience, normalization, discretization) 64 | self.predictor = predictor 65 | self.objective = objective 66 | 67 | 68 | class IterativeOptimizer(Optimizer, ABC): 69 | def __init__(self, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 70 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 71 | patience: int = 5, output: Target = Target.CONSTANT, normalization=None, discretization=None): 72 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 73 | patience, normalization, discretization) 74 | self.max_depth = max_depth 75 | 76 | def _iteration_improvement(self, best, other): 77 | if other[0] == best[0]: 78 | return (best[1] - other[1]) * 2 79 | return 1 / ( 80 | (1 - other[0] / best[0]) ** self.readability_tradeoff * 81 | np.ceil(other[1] / self.readability_tradeoff) / np.ceil(best[1] / self.readability_tradeoff) 82 | ) 83 | 84 | def _check_iteration_improvement(self, best, current): 85 | improvement = \ 86 | self._iteration_improvement([best[0], best[1]], [current[0], current[1]]) if best is not None else np.inf 87 | if isinstance(improvement, complex): 88 | improvement = 1.0 89 | return current, improvement < 1.2 90 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/divine/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tuprolog.theory import Theory 4 | 5 | from psyke import Target, get_default_random_seed 6 | from psyke.extraction.hypercubic import HyperCubeExtractor 7 | from psyke.extraction.hypercubic.hypercube import Point, GenericCube, HyperCube 8 | 9 | from sklearn.neighbors import BallTree 10 | 11 | 12 | class DiViNE(HyperCubeExtractor): 13 | """ 14 | Explanator implementing DiViNE algorithm. 
15 | """ 16 | 17 | def __init__(self, predictor, k: int = 5, patience: int = 15, close_to_center: bool = True, 18 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.CLASSIFICATION, discretization, normalization) 20 | self.k = k 21 | self.patience = patience 22 | self.vicinity_function = DiViNE.closest_to_center if close_to_center else DiViNE.closest_to_corners 23 | self.seed = seed 24 | 25 | @staticmethod 26 | def __pop(data: pd.DataFrame, idx: int = None) -> (Point, pd.DataFrame): 27 | if idx is None: 28 | idx = data.sample(1).index.values[0] 29 | t = data.T 30 | return DiViNE.__to_point(t.pop(idx)), t.T.reset_index(drop=True) 31 | 32 | @staticmethod 33 | def __to_point(instance) -> Point: 34 | point = Point(instance.index.values, instance.values) 35 | return point 36 | 37 | def __to_cube(self, point: Point) -> GenericCube: 38 | cube = HyperCube.cube_from_point(point.dimensions, self._output) 39 | cube._output = list(point.dimensions.values())[-1] 40 | return cube 41 | 42 | def __clean(self, data: pd.DataFrame) -> pd.DataFrame: 43 | _, idx = BallTree(data.iloc[:, :-1]).query(data.iloc[:, :-1], k=self.k) 44 | # how many output classes are associated with the k neighbors 45 | count = np.array(list(map(lambda indices: len(data.iloc[indices].iloc[:, -1].unique()), idx))) 46 | # instances with neighbors of different classes are discarded 47 | return data[count == 1] 48 | 49 | def __closest(self, data: pd.DataFrame, cube: GenericCube) -> (Point, pd.DataFrame): 50 | return DiViNE.__pop(data, self.vicinity_function(BallTree(data.iloc[:, :-1]), cube)) 51 | 52 | @staticmethod 53 | def closest_to_center(tree: BallTree, cube: GenericCube): 54 | return tree.query([list(cube.center.dimensions.values())], k=1)[1][0][-1] 55 | 56 | @staticmethod 57 | def closest_to_corners(tree: BallTree, cube: GenericCube): 58 | distance, idx = tree.query([list(point.dimensions.values()) for point in cube.corners()], k=1) 59 | return idx[np.argmin(distance)][-1] 60 | 61 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 62 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=Target.CLASSIFICATION) 63 | np.random.seed(self.seed) 64 | data = self.__clean(dataframe) 65 | 66 | while len(data) > 0: 67 | discarded = [] 68 | patience = self.patience 69 | point, data = self.__pop(data) 70 | cube = self.__to_cube(point) 71 | 72 | while patience > 0 and len(data) > 0: 73 | other, data = self.__closest(data, cube) 74 | if cube.output == list(other.dimensions.values())[-1]: 75 | cube = cube.merge_with_point(other) 76 | data = data[~(cube.filter_indices(data.iloc[:, :-1]))].reset_index(drop=True) 77 | else: 78 | patience -= 1 79 | discarded.append(other) 80 | if cube.volume() > 0: 81 | cube.update(dataframe, self.predictor) 82 | self._hypercubes.append(cube) 83 | if len(discarded) > 0: 84 | data = pd.concat([data] + [d.to_dataframe() for d in discarded]).reset_index(drop=True) 85 | self._sort_cubes() 86 | return self._create_theory(dataframe) 87 | -------------------------------------------------------------------------------- /psyke/genetic/fgin/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from psyke import Target 5 | from psyke.genetic.gin import GIn 6 | 7 | import skfuzzy as skf 8 | 9 | 10 | class FGIn(GIn): 11 | 12 | def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3, 
13 | metric='R2', output=Target.REGRESSION, warm=False): 14 | super().__init__(train, valid, features, sigmas, slices, min_rules, poly, alpha, indpb, tournsize, 15 | metric, output, warm) 16 | self.feature_to_idx = {f: i for i, f in enumerate(self.X.columns)} 17 | 18 | def _evaluate(self, individual=None): 19 | y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0]) 20 | if valid_regions < self.min_rules: 21 | return -9999, 22 | return self._score(self.y if self.valid is None else self.valid[1], y_pred), 23 | 24 | @staticmethod 25 | def __generate_membership(var, domain, thresholds, shape='tri'): 26 | th = [var.min()] + [min(max(t, var.min()), var.max()) for t in thresholds] + [var.max()] 27 | 28 | if shape == 'tri': 29 | mid = [(x1 + x2) / 2 for x1, x2 in zip(th[:-1], th[1:])] 30 | return [skf.trapmf(domain, [domain.min()] * 2 + mid[:2])] + \ 31 | [skf.trimf(domain, [x1, x2, x3]) for x1, x2, x3 in zip(mid[:-2], mid[1:-1], mid[2:])] + \ 32 | [skf.trapmf(domain, mid[-2:] + [domain.max()] * 2)] 33 | if shape == 'trap': 34 | beg = [None, domain.min()] + [(3 * x1 + x2) / 4 for x1, x2 in zip(th[1:-1], th[2:])] + [domain.max()] 35 | end = [domain.min()] + [(x1 + 3 * x2) / 4 for x1, x2 in zip(th[:-2], th[1:-1])] + [domain.max()] 36 | return [skf.trapmf(domain, [end[i - 1], beg[i], end[i], beg[i + 1]]) for i in range(1, len(th))] 37 | raise ValueError('Supported shape values are only \'tri\' and \'trap\'') 38 | 39 | @staticmethod 40 | def __extend_domain(x, q_low=0.05, q_high=0.95, p=0.05, k_sigma=2.0, abs_min_margin=0.0): 41 | ql, qh = np.quantile(x, [q_low, q_high]) 42 | margin = max(p * (qh - ql), k_sigma * np.std(x), abs_min_margin) 43 | return np.array([ql - margin, qh + margin]) 44 | 45 | def __get_activations(self, x, functions_domains, valid_masks): 46 | levels = [np.array([skf.interp_membership(domain, mf, x[index]) for mf in mfs]) 47 | for mfs, domain, index in functions_domains.values()] 48 | return np.prod(np.meshgrid(*levels, indexing='ij'), axis=0).ravel()[valid_masks] 49 | 50 | def __fuzzify(self, cuts): 51 | cuts = dict(zip(self.features, cuts)) 52 | doms = {c: FGIn.__extend_domain(self.X[c]) for c in self.features} 53 | return {c: (FGIn.__generate_membership(self.X[c], doms[c], cuts[c], 'trap'), doms[c], 54 | self.feature_to_idx[c]) for c in self.features} 55 | 56 | def __predict(self, individual=None, to_pred=None): 57 | cuts = self._get_cuts(individual or self.best) 58 | masks = np.array([self._region(to_pred, cuts) == r for r in range(np.prod([s + 1 for s in self.slices]))]) 59 | valid_masks = masks.sum(axis=1) >= 3 60 | 61 | masks = [mask for mask in masks if mask.sum() >= 3] 62 | functions_domains = self.__fuzzify(cuts) 63 | 64 | pred = np.array([self._output_estimation(mask, to_pred) for mask in masks]).T 65 | activations = np.array([self.__get_activations(x, functions_domains, valid_masks) for x in to_pred.values]) 66 | 67 | if self.output == Target.CLASSIFICATION: 68 | classes, idx = np.unique(pred, return_inverse=True) 69 | pred = classes[np.argmax(np.vstack([activations[:, idx == i].sum(axis=1) for i, c in enumerate(classes)]), 70 | axis=0)] 71 | else: 72 | pred = (pred * activations).sum(axis=1) 73 | 74 | return pd.DataFrame(pred, index=to_pred.index), len(masks) 75 | -------------------------------------------------------------------------------- /test/unit/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from 
sklearn.model_selection import train_test_split 3 | from tuprolog.solve.prolog import prolog_solver 4 | from psyke.extraction.hypercubic import Grid, FeatureRanker 5 | from psyke.utils.dataframe import get_discrete_dataset 6 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 7 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 8 | from test import get_dataset, get_extractor, get_schema, get_model 9 | from test.resources.tests import test_cases 10 | from tuprolog.theory import Theory, mutable_theory 11 | from tuprolog.theory.parsing import parse_theory 12 | from typing import Callable 13 | import ast 14 | import numpy as np 15 | from psyke import get_default_random_seed 16 | 17 | 18 | def initialize(file: str) -> list[dict[str:Theory]]: 19 | for row in test_cases(file): 20 | params = dict() if row['extractor_params'] == '' else ast.literal_eval(row['extractor_params']) 21 | dataset = get_dataset(row['dataset']) 22 | 23 | training_set, test_set = train_test_split(dataset, test_size=0.05 if row['dataset'].lower() == 'house' else 0.5, 24 | random_state=get_default_random_seed()) 25 | 26 | schema, test_set_for_predictor = None, test_set 27 | if 'disc' in row.keys() and bool(row['disc']): 28 | schema = get_schema(training_set) 29 | params['discretization'] = schema 30 | training_set = get_discrete_dataset(training_set.iloc[:, :-1], schema) \ 31 | .join(training_set.iloc[:, -1].reset_index(drop=True)) 32 | test_set_for_predictor = get_discrete_dataset(test_set.iloc[:, :-1], schema) \ 33 | .join(test_set.iloc[:, -1].reset_index(drop=True)) 34 | 35 | # Handle Cart tests. 36 | # Cart needs to inspect the tree of the predictor. 37 | # Unfortunately onnx does not provide a method to do that. 38 | #if row['predictor'].lower() not in ['dtc', 'dtr']: 39 | # params['predictor'] = Predictor.load_from_onnx(str(get_predictor_path(row['predictor']))) 40 | #else: 41 | predictor, fitted = get_model(row['predictor'], {}) 42 | if not fitted: 43 | predictor.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 44 | params['predictor'] = predictor 45 | 46 | # Handle GridEx tests 47 | # TODO: this is algorithm specific therefore it should be handled inside the algorithm itself. 
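# The 'strategies' cell is eval'd into a (strategy, n) pair: "F" builds
# Grid(int(row['grid']), FixedStrategy(n)), while any other value ranks the training
# features with FeatureRanker on the fitted predictor and uses
# Grid(int(row['grid']), AdaptiveStrategy(ranked, n)) instead.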
48 | if 'grid' in row.keys() and bool: 49 | strategy, n = eval(row['strategies']) 50 | if strategy == "F": 51 | params['grid'] = Grid(int(row['grid']), FixedStrategy(n)) 52 | else: 53 | ranked = FeatureRanker(training_set.columns[:-1]) \ 54 | .fit(params['predictor'], training_set.iloc[:, :-1]).rankings() 55 | params['grid'] = Grid(int(row['grid']), AdaptiveStrategy(ranked, n)) 56 | 57 | extractor = get_extractor(row['extractor_type'], params) 58 | theory = extractor.extract(training_set) 59 | 60 | # Compute predictions from rules 61 | index = test_set.shape[1] - 1 62 | 63 | cast, substitutions = get_substitutions(test_set, theory) 64 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions if query.is_yes] 65 | predictions = [prediction for prediction in extractor.predict(test_set_for_predictor.iloc[:, :-1]) 66 | if prediction is not None] 67 | 68 | yield { 69 | 'extractor': extractor, 70 | 'extracted_theory': theory, 71 | 'extracted_test_y_from_theory': np.array(expected), 72 | 'extracted_test_y_from_extractor': np.array(predictions), 73 | 'test_set': test_set, 74 | 'expected_theory': parse_theory(row['theory'] + '.') if row['theory'] != '' else None, 75 | 'discretization': schema 76 | } 77 | 78 | 79 | def get_substitutions(test_set, theory): 80 | cast: Callable = lambda x: (str(x) if isinstance(test_set.iloc[0, -1], str) else float(x.value)) 81 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 82 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in test_set.iterrows()] 83 | return cast, substitutions 84 | -------------------------------------------------------------------------------- /psyke/hypercubepredictor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.neighbors import BallTree 8 | 9 | from psyke import EvaluableModel, Target, get_int_precision 10 | from psyke.extraction.hypercubic import RegressionCube, GenericCube, Point 11 | 12 | 13 | class HyperCubePredictor(EvaluableModel): 14 | def __init__(self, output=Target.CONSTANT, discretization=None, normalization=None): 15 | super().__init__(discretization, normalization) 16 | self._hypercubes = [] 17 | self._dimensions_to_ignore = set() 18 | self._output = output 19 | self._surrounding = None 20 | 21 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 22 | return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()]) 23 | 24 | def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable: 25 | predictions = np.array(self._predict(dataframe)) 26 | idx = [prediction is None for prediction in predictions] 27 | if sum(idx) > 0: 28 | if criterion == 'default': 29 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output( 30 | self._surrounding, row 31 | ) for _, row in dataframe[idx].iterrows()]) 32 | elif criterion == 'surface': 33 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output(self._brute_predict_surface(row), row) 34 | for _, row in dataframe[idx].iterrows()]) 35 | else: 36 | tree, cubes = self._create_brute_tree(criterion, n) 37 | predictions[idx] = np.array([HyperCubePredictor._brute_predict_from_cubes( 38 | row.to_dict(), tree, cubes 39 | ) for _, row in dataframe[idx].iterrows()]) 40 | return np.array(predictions) 41 | 42 | @staticmethod 43 | def 
_brute_predict_from_cubes(row: dict[str, float], tree: BallTree, 44 | cubes: list[GenericCube]) -> float | str: 45 | idx = tree.query([list(row.values())], k=1)[1][0][0] 46 | return HyperCubePredictor._get_cube_output(cubes[idx], row) 47 | 48 | def _brute_predict_surface(self, row: pd.Series) -> GenericCube: 49 | return min([( 50 | cube.surface_distance(Point(list(row.keys()), list(row.values))), cube.volume(), cube 51 | ) for cube in self._hypercubes])[-1] 52 | 53 | def _create_brute_tree(self, criterion: str = 'center', n: int = 2) -> (BallTree, list[GenericCube]): 54 | admissible_criteria = ['surface', 'center', 'corner', 'perimeter', 'density', 'default'] 55 | if criterion not in admissible_criteria: 56 | raise NotImplementedError( 57 | "'criterion' should be chosen in " + str(admissible_criteria) 58 | ) 59 | 60 | points = [(cube.center, cube) for cube in self._hypercubes] if criterion == 'center' else \ 61 | [(cube.barycenter, cube) for cube in self._hypercubes] if criterion == 'density' else \ 62 | [(corner, cube) for cube in self._hypercubes for corner in cube.corners()] if criterion == 'corner' else \ 63 | [(point, cube) for cube in self._hypercubes for point in cube.perimeter_samples(n)] \ 64 | if criterion == 'perimeter' else None 65 | 66 | return BallTree(pd.concat([point[0].to_dataframe() for point in points], ignore_index=True)), \ 67 | [point[1] for point in points] 68 | 69 | def _predict_from_cubes(self, data: dict[str, float]) -> float | str | None: 70 | cube = self._find_cube(data) 71 | if cube is None: 72 | return None 73 | elif self._output == Target.CLASSIFICATION: 74 | return HyperCubePredictor._get_cube_output(cube, data) 75 | else: 76 | return round(HyperCubePredictor._get_cube_output(cube, data), get_int_precision()) 77 | 78 | def _find_cube(self, data: dict[str, float]) -> GenericCube | None: 79 | if not self._hypercubes: 80 | return None 81 | data = data.copy() 82 | for dimension in self._dimensions_to_ignore: 83 | if dimension in data: 84 | del data[dimension] 85 | for cube in self._hypercubes: 86 | if data in cube: 87 | return cube.copy() 88 | if self._hypercubes[-1].is_default: 89 | return self._hypercubes[-1].copy() 90 | 91 | @property 92 | def n_rules(self): 93 | return len(list(self._hypercubes)) 94 | 95 | @property 96 | def volume(self): 97 | return sum([cube.volume() for cube in self._hypercubes]) 98 | 99 | @staticmethod 100 | def _get_cube_output(cube, data: dict[str, float]) -> float: 101 | return cube.output.predict(pd.DataFrame([data])).flatten()[0] if \ 102 | isinstance(cube, RegressionCube) else cube.output 103 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/ginger/__init__.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Iterable 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from tuprolog.theory import Theory 9 | 10 | from psyke import get_default_random_seed, Target 11 | from psyke.extraction.hypercubic import HyperCubeExtractor, HyperCube, RegressionCube 12 | 13 | from deap import base, creator 14 | 15 | from psyke.genetic.gin import GIn 16 | 17 | 18 | class GInGER(HyperCubeExtractor): 19 | """ 20 | Explanator implementing GInGER algorithm. 
21 | """ 22 | 23 | def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5, 24 | tournsize=3, metric='R2', n_gen=50, n_pop=50, threshold=None, valid=None, 25 | output: Target = Target.REGRESSION, normalization=None, seed: int = get_default_random_seed()): 26 | super().__init__(predictor, output=Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 27 | normalization=normalization) 28 | self.threshold = threshold 29 | np.random.seed(seed) 30 | 31 | self.features = features 32 | self.max_features = len(features) 33 | self.sigmas = sigmas 34 | self.max_slices = max_slices 35 | self.min_rules = min_rules 36 | self.poly = max_poly 37 | self.trained_poly = None 38 | 39 | self.alpha = alpha 40 | self.indpb = indpb 41 | self.tournsize = tournsize 42 | self.metric = metric 43 | 44 | self.n_gen = n_gen 45 | self.n_pop = n_pop 46 | self.valid = valid 47 | 48 | creator.create("FitnessMax", base.Fitness, weights=(1.0,)) 49 | creator.create("Individual", list, fitness=creator.FitnessMax) 50 | 51 | def __poly_names(self): 52 | return [''.join(['' if pp == 0 else f'{n} * ' if pp == 1 else f'{n}**{pp} * ' 53 | for pp, n in zip(p, self.trained_poly.feature_names_in_)])[:-3] 54 | for p in self.trained_poly.powers_] 55 | 56 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 57 | dataframe = pd.DataFrame(self.trained_poly.fit_transform(dataframe), columns=self.__poly_names()) 58 | return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()]) 59 | 60 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 61 | best = {} 62 | for poly in range(self.poly): 63 | for slices in list(itertools.product(range(1, self.max_slices + 1), repeat=self.max_features)): 64 | gr = GIn((dataframe.iloc[:, :-1], dataframe.iloc[:, -1]), self.valid, self.features, self.sigmas, 65 | slices, min_rules=self.min_rules, poly=poly + 1, alpha=self.alpha, indpb=self.indpb, 66 | tournsize=self.tournsize, metric=self.metric, output=self._output, warm=True) 67 | 68 | b, score, _, _ = gr.run(n_gen=self.n_gen, n_pop=self.n_pop) 69 | best[(score, poly + 1, slices)] = b 70 | m = min(best) 71 | poly, slices, best = m[1], m[2], best[m] 72 | self.trained_poly = PolynomialFeatures(degree=poly, include_bias=False) 73 | transformed = pd.DataFrame(self.trained_poly.fit_transform(dataframe.iloc[:, :-1]), columns=self.__poly_names()) 74 | transformed[dataframe.columns[-1]] = dataframe.iloc[:, -1].values 75 | 76 | self._surrounding = HyperCube.create_surrounding_cube(transformed, output=self._output) 77 | 78 | cuts = [sorted(best[sum(slices[:i]):sum(slices[:i + 1])]) for i in range(len(slices))] 79 | 80 | intervals = [[(transformed[self.features[i]].min(), cut[0])] + 81 | [(cut[i], cut[i + 1]) for i in range(len(cut) - 1)] + 82 | [(cut[-1], transformed[self.features[i]].max())] for i, cut in enumerate(cuts)] 83 | 84 | hypercubes = [{f: iv for f, iv in zip(self.features, combo)} for combo in itertools.product(*intervals)] 85 | mi_ma = {f: (transformed[f].min(), transformed[f].max()) for f in transformed.columns if f not in self.features} 86 | self._hypercubes = [self._default_cube({feat: h[feat] if feat in self.features else mi_ma[feat] 87 | for feat in transformed.columns[:-1]}) for h in hypercubes] 88 | self._hypercubes = [c for c in self._hypercubes if c.count(transformed) >= 2] 89 | for c in self._hypercubes: 90 | for feature in transformed.columns: 91 | if feature not in self.features: 92 | for direction in ['+', '-']: 93 | 
c.set_infinite(feature, direction) 94 | c.update(transformed) 95 | if self.threshold is not None: 96 | self._hypercubes = self._merge(self._hypercubes, transformed) 97 | return self._create_theory(transformed) 98 | 99 | def make_fair(self, features: Iterable[str]): 100 | self._dimensions_to_ignore.update(features) 101 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/hex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from psyke import get_default_random_seed, Target 9 | from psyke.extraction.hypercubic import Grid, HyperCube, GenericCube, ClassificationCube 10 | from psyke.extraction.hypercubic.gridex import GridEx 11 | 12 | 13 | class HEx(GridEx): 14 | """ 15 | Explanator implementing HEx algorithm. 16 | """ 17 | 18 | class Node: 19 | def __init__(self, cube: GenericCube, parent: HEx.Node = None, threshold: float = None): 20 | self.cube = cube 21 | self.parent = parent 22 | self.children: Iterable[HEx.Node] = [] 23 | self.threshold = threshold 24 | self.gain = True if parent is None else self.check() 25 | 26 | def check(self) -> bool: 27 | other = self.parent 28 | try: 29 | while not other.gain: 30 | other = other.parent 31 | except AttributeError: 32 | return True 33 | if isinstance(other.cube, ClassificationCube): 34 | return other.cube.output != self.cube.output 35 | return other.cube.error - self.cube.error > self.threshold * .6 36 | 37 | def indices(self, dataframe: pd.DataFrame): 38 | return self.cube.filter_indices(dataframe.iloc[:, :-1]) 39 | 40 | def eligible_children(self, dataframe) -> Iterable[HEx.Node]: 41 | return [c for c in self.children if c.cube.count(dataframe) > 0] 42 | 43 | def permanent_children(self, dataframe) -> Iterable[HEx.Node]: 44 | return [c for c in self.eligible_children(dataframe) if c.gain] 45 | 46 | def permanent_indices(self, dataframe): 47 | return np.any([c.cube.filter_indices(dataframe.iloc[:, :-1]) 48 | for c in self.eligible_children(dataframe) if c.gain], axis=0) 49 | 50 | def update(self, dataframe: pd.DataFrame, predictor, recursive=False): 51 | if recursive: 52 | for node in self.children: 53 | node.update(dataframe, predictor, recursive) 54 | cleaned = [(c.cube, c.gain) for c in self.eligible_children(dataframe)] 55 | idx = self.permanent_indices(dataframe) 56 | 57 | if sum(g for _, g in cleaned) > 0 and sum(self.indices(dataframe)) > sum(idx) and self.gain: 58 | self.cube.update(dataframe[self.indices(dataframe) & ~idx], predictor) 59 | return cleaned 60 | 61 | def linearize(self, dataframe, depth=1): 62 | children = [c.linearize(dataframe, depth + 1) for c in self.permanent_children(dataframe)] 63 | return [(cc, dd) for c in children for cc, dd in c if c != []] + \ 64 | [(c, depth) for c in self.permanent_children(dataframe)] 65 | 66 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 67 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 68 | super().__init__(predictor, grid, min_examples, threshold, output, discretization, normalization, seed) 69 | self._default_surrounding_cube = True 70 | 71 | def _gain(self, parent_cube: GenericCube, new_cube: GenericCube) -> float: 72 | if isinstance(parent_cube, ClassificationCube): 73 | return parent_cube.output != new_cube.output 74 | return parent_cube.error - 
new_cube.error > self.threshold * .6 75 | 76 | def _iterate(self, dataframe: pd.DataFrame): 77 | fake = dataframe.copy() 78 | self._surrounding.update(dataframe, self.predictor) 79 | root = HEx.Node(self._surrounding, threshold=self.threshold) 80 | current = [root] 81 | 82 | for iteration in self.grid.iterate(): 83 | next_iteration = [] 84 | for node in current: 85 | if node.cube.diversity < self.threshold: 86 | continue 87 | children, fake = self._cubes_to_split(node.cube, iteration, dataframe, fake, True) 88 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in children] 89 | cleaned = node.update(fake, self.predictor, False) 90 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in self._merge( 91 | [c for c, _ in cleaned], fake)] 92 | next_iteration += [n for n in node.children] 93 | 94 | current = next_iteration.copy() 95 | _ = root.update(fake, self.predictor, True) 96 | self._hypercubes = [] 97 | linearized = root.linearize(fake) 98 | for depth in sorted(np.unique([d for (_, d) in linearized]), reverse=True): 99 | self._hypercubes += self._merge([c.cube for (c, d) in linearized if d == depth], fake) 100 | 101 | if len(self._hypercubes) == 0: 102 | self._hypercubes = [self._surrounding] 103 | elif not min(np.any([c.filter_indices(dataframe.iloc[:, :-1]) for c in self._hypercubes], axis=0)): 104 | self._hypercubes = self._hypercubes + [self._surrounding] 105 | -------------------------------------------------------------------------------- /psyke/clustering/exact/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | from collections import Counter 5 | from typing import Iterable, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.cluster import DBSCAN 10 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 11 | 12 | from psyke.clustering import HyperCubeClustering 13 | from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube 14 | from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon 15 | from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube 16 | from psyke.utils import Target, get_default_random_seed 17 | 18 | 19 | class ExACT(HyperCubeClustering, ABC): 20 | """ 21 | Explanator implementing ExACT algorithm. 
22 | """ 23 | 24 | def __init__(self, depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, 25 | gauss_components: int = 2, discretization=None, normalization=None, 26 | seed: int = get_default_random_seed()): 27 | super().__init__(output, discretization, normalization) 28 | self.depth = depth 29 | self.error_threshold = error_threshold 30 | self.gauss_components = gauss_components 31 | self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor() 32 | self._predictor.n_neighbors = 1 33 | self.seed = seed 34 | 35 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 36 | cubes = [] 37 | for i in range(len(np.unique(gauss_pred))): 38 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 39 | if len(df) == 0: 40 | continue 41 | cubes.append(self._create_cube(df, clusters)) 42 | indices = [self._indices(cube, node.dataframe) for cube in cubes] 43 | return cubes, indices 44 | 45 | @staticmethod 46 | def _indices(cube: ClosedCube, data: pd.DataFrame) -> np.ndarray | None: 47 | indices = cube.filter_indices(data.iloc[:, :-1]) 48 | if len(data.iloc[indices]) * len(data.iloc[~indices]) == 0: 49 | return None 50 | return indices 51 | 52 | def _create_cube(self, dataframe: pd.DataFrame, clusters: int) -> ClosedCube: 53 | data = ExACT._remove_string_label(dataframe) 54 | dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1]) 55 | return HyperCube.create_surrounding_cube( 56 | dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])], 57 | True, self._output, self._protected_features 58 | ) 59 | 60 | def fit(self, dataframe: pd.DataFrame): 61 | np.random.seed(self.seed) 62 | self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1]) 63 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features) 64 | self._hypercubes = self._iterate(Node(dataframe, self._surrounding)) 65 | 66 | def get_hypercubes(self) -> Iterable[HyperCube]: 67 | return list(self._hypercubes) 68 | 69 | def explain(self): 70 | for cube in self._hypercubes: 71 | print(f'Output is {cube.output} if:') 72 | for feature in cube.dimensions: 73 | lower, upper = cube[feature] 74 | print(f' {feature} is in [{lower:.2f}, {upper:.2f}]') 75 | 76 | @staticmethod 77 | def _remove_string_label(dataframe: pd.DataFrame): 78 | return dataframe.replace({dataframe.columns[-1]: {v: k for k, v in dict( 79 | enumerate(dataframe.iloc[:, -1].unique()) 80 | ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe 81 | 82 | def _get_gauss_predictions(self, to_split): 83 | to_split.sort(reverse=True) 84 | (_, depth, _, node) = to_split.pop() 85 | data = ExACT._remove_string_label(node.dataframe) 86 | gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components) 87 | return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params 88 | 89 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 90 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 91 | while len(to_split) > 0: 92 | node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split) 93 | cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 94 | cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices)) 95 | if (idx is not None) and (not node.cube.equal(c))] 96 | if len(cubes) < 1: 97 | continue 98 | _, _, 
_, indices, cube = max(cubes) 99 | 100 | cube.update(node.dataframe[indices], self._predictor) 101 | node.right = Node(node.dataframe[indices], cube) 102 | node.cube.update(node.dataframe[~indices], self._predictor) 103 | node.left = Node(node.dataframe[~indices], node.cube) 104 | 105 | if depth < self.depth and cube.diversity > self.error_threshold: 106 | to_split.append((cube.diversity, depth + 1, np.random.uniform(), node.right)) 107 | return self._node_to_cubes(surrounding) 108 | 109 | def _node_to_cubes(self, root: Node) -> list[ClosedCube]: 110 | if root.right is None: 111 | return [root.cube] 112 | else: 113 | return self._node_to_cubes(root.right) + self._node_to_cubes(root.left) 114 | 115 | def _default_cube(self) -> Union[ClosedCube, ClosedRegressionCube, ClosedClassificationCube]: 116 | if self._output == Target.CONSTANT: 117 | return ClosedCube() 118 | if self._output == Target.REGRESSION: 119 | return ClosedRegressionCube() 120 | return ClosedClassificationCube() 121 | -------------------------------------------------------------------------------- /psyke/extraction/cart/CartPredictor.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import Union, Any 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 6 | from tuprolog.core import clause, Var, Struct 7 | from tuprolog.theory import Theory, mutable_theory 8 | 9 | from psyke.extraction.cart import LeafConstraints, LeafSequence 10 | from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature 11 | from psyke.utils.logic import create_variable_list, create_head, create_term 12 | 13 | 14 | class CartPredictor: 15 | """ 16 | A wrapper for decision and regression trees of sklearn. 
17 | """ 18 | 19 | def __init__(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor] = DecisionTreeClassifier(), 20 | discretization=None, normalization=None): 21 | self._predictor = predictor 22 | self.discretization = discretization 23 | self.normalization = normalization 24 | 25 | def __get_constraints(self, nodes: Iterable[tuple[int, bool]]) -> LeafConstraints: 26 | thresholds = [self._predictor.tree_.threshold[i[0]] for i in nodes] 27 | features = [self._predictor.feature_names_in_[self._predictor.tree_.feature[node[0]]] for node in nodes] 28 | conditions = [node[1] for node in nodes] 29 | if self.normalization is not None: 30 | thresholds = [threshold * self.normalization[feature][1] + self.normalization[feature][0] 31 | for feature, threshold in zip(features, thresholds)] 32 | cond_dict = {} 33 | for feature, condition, threshold in zip(features, conditions, thresholds): 34 | cond = LessThan(threshold) if condition else GreaterThan(threshold) 35 | if feature in cond_dict: 36 | try: 37 | cond_dict[feature][-1] *= cond 38 | except SchemaException: 39 | cond_dict[feature].append(cond) 40 | else: 41 | cond_dict[feature] = [cond] 42 | return cond_dict 43 | 44 | def __get_leaves(self) -> Iterable[int]: 45 | return [i for i, (left_child, right_child) in enumerate(zip( 46 | self._left_children, self._right_children 47 | )) if left_child == -1 and right_child == -1] 48 | 49 | def __get_prediction(self, node: int) -> Any: 50 | if hasattr(self._predictor, 'classes_'): 51 | return self._predictor.classes_[np.argmax(self._predictor.tree_.value[node])] 52 | else: 53 | return self._predictor.tree_.value[node] 54 | 55 | def __path(self, node: int, path=None) -> Iterable[tuple[int, bool]]: 56 | path = [] if path is None else path 57 | if node == 0: 58 | return path 59 | father = list(self._left_children if node in self._left_children else self._right_children).index(node) 60 | return self.__path(father, [(father, node in self._left_children)] + path) 61 | 62 | def __iter__(self) -> LeafSequence: 63 | leaves = self.__get_leaves() 64 | return ((self.__get_constraints(self.__path(i)), self.__get_prediction(i)) for i in leaves) 65 | 66 | def predict(self, data) -> Iterable: 67 | return self._predictor.predict(data) 68 | 69 | @staticmethod 70 | def _simplify_nodes(nodes: list) -> Iterable: 71 | simplified = [nodes.pop(0)] 72 | while len(nodes) > 0: 73 | first_node = nodes[0][0] 74 | for k, conditions in first_node.items(): 75 | for condition in conditions: 76 | if all(k in node[0] and condition in node[0][k] for node in nodes): 77 | [node[0][k].remove(condition) for node in nodes] 78 | simplified.append(nodes.pop(0)) 79 | return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified] 80 | 81 | def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]: 82 | results = [] 83 | for feature_name, cond_list in conditions.items(): 84 | for condition in cond_list: 85 | feature: DiscreteFeature = [d for d in self.discretization if feature_name in d.admissible_values][0] \ 86 | if self.discretization else None 87 | results.append(create_term(variables[feature_name], condition) if feature is None else 88 | create_term(variables[feature.name], 89 | feature.admissible_values[feature_name], 90 | isinstance(condition, GreaterThan))) 91 | return results 92 | 93 | def create_theory(self, data: pd.DataFrame, simplify: bool = True) -> Theory: 94 | new_theory = mutable_theory() 95 | nodes = [node for node in self] 96 | nodes = 
self._simplify_nodes(nodes) if simplify else nodes 97 | for (constraints, prediction) in nodes: 98 | if self.normalization is not None and data.columns[-1] in self.normalization: 99 | m, s = self.normalization[data.columns[-1]] 100 | prediction = prediction * s + m 101 | variables = create_variable_list(self.discretization, data) 102 | new_theory.assertZ( 103 | clause( 104 | create_head(data.columns[-1], list(variables.values()), prediction), 105 | self._create_body(variables, constraints) 106 | ) 107 | ) 108 | return new_theory 109 | 110 | @property 111 | def predictor(self) -> Union[DecisionTreeClassifier, DecisionTreeRegressor]: 112 | return self._predictor 113 | 114 | @property 115 | def n_leaves(self) -> int: 116 | return len(list(self.__get_leaves())) 117 | 118 | @property 119 | def _left_children(self) -> list[int]: 120 | return self._predictor.tree_.children_left 121 | 122 | @property 123 | def _right_children(self) -> list[int]: 124 | return self._predictor.tree_.children_right 125 | 126 | @predictor.setter 127 | def predictor(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor]): 128 | self._predictor = predictor 129 | -------------------------------------------------------------------------------- /psyke/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.real.utils import Rule, IndexedRuleSet 4 | from psyke.schema import DiscreteFeature 5 | from psyke.utils.dataframe import HashableDataFrame 6 | from psyke.utils.logic import create_term, create_head, create_variable_list 7 | from tuprolog.core import Var, Struct, Clause, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable 10 | import pandas as pd 11 | import numpy as np 12 | 13 | 14 | class REAL(PedagogicalExtractor): 15 | """ 16 | Explanator implementing Rule Extraction As Learning (REAL) algorithm, doi:10.1016/B978-1-55860-335-6.50013-1. 17 | The algorithm is sensible to features' order in the provided dataset during extraction. 
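
    Illustrative usage (a sketch; `discrete_dataset` denotes a dataset whose features have been
    one-hot discretised, e.g. via psyke.utils.dataframe.get_discrete_dataset, with the class as
    last column, and `classifier` a predictor trained on it; all names are placeholders):

    >>> from psyke.utils.dataframe import get_discrete_features_supervised
    >>> schema = get_discrete_features_supervised(original_dataset)
    >>> real = REAL(classifier, schema)
    >>> theory = real.extract(discrete_dataset)   # Prolog theory, one clause per extracted rule
    >>> real.n_rules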
18 | """ 19 | 20 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature]): 21 | super().__init__(predictor, discretization) 22 | self._ignore_feature = [] 23 | self._ruleset: IndexedRuleSet = IndexedRuleSet() 24 | 25 | @property 26 | def n_rules(self): 27 | return len(self._ruleset.flatten()) 28 | 29 | def _covers(self, sample: pd.Series, rules: list[Rule]) -> bool: 30 | new_rule = self._rule_from_example(sample) 31 | return any([new_rule in rule for rule in rules]) 32 | 33 | def _body(self, variables: dict[str, Var], rule: Rule) -> list[Struct]: 34 | result = [] 35 | for predicates, truth_value in zip(rule.to_lists(), [True, False]): 36 | for predicate in predicates: 37 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 38 | result.append(create_term(variables[feature.name], feature.admissible_values[predicate], truth_value)) 39 | return result 40 | 41 | def _create_clause(self, dataset: pd.DataFrame, variables: dict[str, Var], key: int, rule: Rule) -> Clause: 42 | return clause(create_head(dataset.columns[-1], list(variables.values()), key), self._body(variables, rule)) 43 | 44 | def _create_new_rule(self, sample: pd.Series) -> Rule: 45 | rule = self._rule_from_example(sample) 46 | return self._generalise(rule, sample) 47 | 48 | def _create_ruleset(self, dataset: pd.DataFrame) -> IndexedRuleSet: 49 | ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(dataset.iloc[:, -1]))) 50 | for _, sample in dataset.iloc[:, :-1].iterrows(): 51 | prediction = list(self.predictor.predict(sample.to_frame().transpose()))[0] 52 | rules = ruleset.get(prediction) 53 | if not self._covers(sample, rules): 54 | rules.append(self._create_new_rule(sample)) 55 | return ruleset.optimize() 56 | 57 | def _create_theory(self, dataset: pd.DataFrame) -> MutableTheory: 58 | theory = mutable_theory() 59 | for key, rule in self._ruleset.flatten(): 60 | variables = create_variable_list(self.discretization) 61 | theory.assertZ(self._create_clause(dataset, variables, key, rule)) 62 | return theory 63 | 64 | def _generalise(self, rule: Rule, sample: pd.Series) -> Rule: 65 | mutable_rule = rule.to_lists() 66 | samples = sample.to_frame().transpose() 67 | for predicate in rule.true_predicates: 68 | samples = self._remove_antecedent(samples.copy(), predicate, mutable_rule) 69 | return Rule(mutable_rule[0], mutable_rule[1]).reduce(self.discretization) 70 | 71 | def _remove_antecedent(self, samples: pd.DataFrame, predicate: str, rule: list[list[str]]) -> (pd.DataFrame, bool): 72 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 73 | output = np.array(self.predictor.predict(samples)) 74 | copies = [samples.copy()] 75 | samples[predicate] = 0 76 | for f in [f for f in feature.admissible_values if f != predicate]: 77 | copy = samples.copy() 78 | copy[f] = 1 79 | if all(output == np.array(self.predictor.predict(copy))): 80 | copies.append(copy) 81 | rule[1].remove(f) 82 | if len(copies) > 1: 83 | rule[0].remove(predicate) 84 | return pd.concat([df for df in copies], ignore_index=True) 85 | 86 | @lru_cache(maxsize=512) 87 | def _get_or_set(self, dataset: HashableDataFrame) -> IndexedRuleSet: 88 | return self._create_ruleset(dataset) 89 | 90 | def _internal_predict(self, sample: pd.Series): 91 | x = [index for index, rule in self._ruleset.flatten() if self._rule_from_example(sample) in rule] 92 | return x[0] if x else None 93 | 94 | def make_fair(self, features: Iterable[str]): 95 | self._ignore_feature = 
[list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 96 | if self.discretization else [features] 97 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 98 | self._get_or_set.cache_clear() 99 | 100 | def _rule_from_example(self, sample: pd.Series) -> Rule: 101 | true_predicates, false_predicates = [], [] 102 | for feature, value in sample.items(): 103 | if feature in self._ignore_feature: 104 | continue 105 | true_predicates.append(str(feature)) if value == 1 else false_predicates.append(str(feature)) 106 | return Rule(true_predicates, false_predicates) 107 | 108 | def _subset(self, samples: pd.DataFrame, predicate: str) -> (pd.DataFrame, bool): 109 | samples_0 = samples.copy() 110 | samples_0[predicate].values[:] = 0 111 | samples_1 = samples.copy() 112 | samples_1[predicate].values[:] = 1 113 | samples_all = samples_0.append(samples_1) 114 | return samples_all, len(set(self.predictor.predict(samples_all))) == 1 115 | 116 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 117 | self._ruleset = self._get_or_set(HashableDataFrame(dataframe)) 118 | return self._create_theory(dataframe) 119 | 120 | def _predict(self, dataframe) -> Iterable: 121 | return np.array([self._internal_predict(data.transpose()) for _, data in dataframe.iterrows()]) 122 | -------------------------------------------------------------------------------- /psyke/genetic/gin/__init__.py: -------------------------------------------------------------------------------- 1 | from statistics import mode 2 | 3 | import numpy as np 4 | from deap import base, creator, tools, algorithms 5 | import random 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, f1_score, accuracy_score 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | from psyke import Target 11 | 12 | 13 | class GIn: 14 | 15 | def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3, 16 | metric='R2', output=Target.REGRESSION, warm=False): 17 | self.X, self.y = train 18 | self.valid = valid 19 | self.output = output 20 | 21 | self.features = features 22 | self.sigmas = sigmas 23 | self.slices = slices 24 | self.min_rules = min_rules 25 | self.poly = PolynomialFeatures(degree=poly, include_bias=False) 26 | 27 | self.alpha = alpha 28 | self.indpb = indpb 29 | self.tournsize = tournsize 30 | self.metric = metric 31 | 32 | self.toolbox = None 33 | self.stats = None 34 | self.hof = None 35 | self.best = None 36 | 37 | self.__setup(warm) 38 | 39 | def _region(self, x, cuts): 40 | indices = [np.searchsorted(np.array(cut), x[f].to_numpy(), side='right') 41 | for cut, f in zip(cuts, self.features)] 42 | 43 | regions = np.zeros(len(x), dtype=int) 44 | multiplier = 1 45 | for idx, n in zip(reversed(indices), reversed([len(cut) + 1 for cut in cuts])): 46 | regions += idx * multiplier 47 | multiplier *= n 48 | 49 | return regions 50 | 51 | def _output_estimation(self, mask, to_pred): 52 | if self.output == Target.REGRESSION: 53 | return LinearRegression().fit(self.poly.fit_transform(self.X)[mask], self.y[mask]).predict( 54 | self.poly.fit_transform(to_pred)) 55 | if self.output == Target.CONSTANT: 56 | return np.mean(self.y[mask]) 57 | if self.output == Target.CLASSIFICATION: 58 | return mode(self.y[mask]) 59 | raise ValueError('Supported outputs are Target.{REGRESSION, CONSTANT, CLASSIFICATION}') 60 | 61 | def _score(self, true, pred): 62 | if 
self.metric == 'R2': 63 | return r2_score(true, pred) 64 | if self.metric == 'MAE': 65 | return -mean_absolute_error(true, pred) 66 | if self.metric == 'MSE': 67 | return -mean_squared_error(true, pred) 68 | if self.metric == 'F1': 69 | return f1_score(true, pred, average='weighted') 70 | if self.metric == 'ACC': 71 | return accuracy_score(true, pred) 72 | raise ValueError('Supported metrics are R2, MAE, MSE, F1, ACC') 73 | 74 | def predict(self, to_pred): 75 | return self.__predict(to_pred=to_pred)[0] 76 | 77 | def _get_cuts(self, individual): 78 | boundaries = np.cumsum([0] + list(self.slices)) 79 | return [sorted(individual[boundaries[i]:boundaries[i + 1]]) for i in range(len(self.slices))] 80 | 81 | def __predict(self, individual=None, to_pred=None): 82 | cuts = self._get_cuts(individual or self.best) 83 | 84 | regions = self._region(to_pred, cuts) 85 | regionsT = self._region(self.X, cuts) 86 | 87 | pred = np.empty(len(to_pred), dtype=f'U{self.y.str.len().max()}') if self.output == Target.CLASSIFICATION \ 88 | else np.zeros(len(to_pred)) 89 | valid_regions = 0 90 | 91 | for r in range(np.prod([s + 1 for s in self.slices])): 92 | mask = regions == r 93 | maskT = regionsT == r 94 | if min(mask.sum(), maskT.sum()) < 3: 95 | if self.output != Target.CLASSIFICATION: 96 | pred[mask] = np.mean(self.y) 97 | continue 98 | pred[mask] = self._output_estimation(maskT, to_pred[mask]) 99 | valid_regions += 1 100 | 101 | return pred, valid_regions 102 | 103 | def _evaluate(self, individual=None): 104 | y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0]) 105 | if valid_regions < self.min_rules: 106 | return -9999, 107 | return self._score(self.y if self.valid is None else self.valid[1], y_pred), 108 | 109 | def __setup(self, warm=False): 110 | if not warm: 111 | creator.create("FitnessMax", base.Fitness, weights=(1.0,)) 112 | creator.create("Individual", list, fitness=creator.FitnessMax) 113 | 114 | self.toolbox = base.Toolbox() 115 | for f in self.features: 116 | self.toolbox.register(f, random.uniform, self.X[f].min(), self.X[f].max()) 117 | 118 | self.toolbox.register("individual", tools.initCycle, creator.Individual, 119 | (sum([[getattr(self.toolbox, f) for i in range(s)] 120 | for f, s in zip(self.features, self.slices)], [])), n=1) 121 | 122 | self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual) 123 | 124 | self.toolbox.register("mate", tools.cxBlend, alpha=self.alpha) 125 | self.toolbox.register("mutate", tools.mutGaussian, indpb=self.indpb, mu=0, 126 | sigma=sum([[sig] * s for sig, s in zip(self.sigmas, self.slices)], [])) 127 | self.toolbox.register("select", tools.selTournament, tournsize=self.tournsize) 128 | self.toolbox.register("evaluate", self._evaluate) 129 | 130 | self.stats = tools.Statistics(lambda ind: ind.fitness.values[0]) 131 | self.stats.register("avg", np.mean) 132 | # self.stats.register("min", np.min) 133 | self.stats.register("max", np.max) 134 | # self.stats.register("std", np.std) 135 | 136 | self.hof = tools.HallOfFame(1) 137 | 138 | def run(self, n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123): 139 | random.seed(seed) 140 | pop = self.toolbox.population(n=n_pop) 141 | result, log = algorithms.eaSimple(pop, self.toolbox, cxpb=cxpb, mutpb=mutpb, ngen=n_gen, 142 | stats=self.stats, halloffame=self.hof, verbose=False) 143 | self.best = tools.selBest(pop, 1)[0] 144 | return self.best, self._evaluate()[0], result, log 145 | 
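

# Illustrative usage sketch (every name and hyper-parameter value below is a placeholder).
# GIn expects a (X, y) training pair where X is a pandas DataFrame and y the corresponding
# target Series, plus one mutation sigma and one number of cut points ("slices") per feature:
#
#   gin = GIn(train=(X_train, y_train), valid=(X_valid, y_valid),
#             features=list(X_train.columns),
#             sigmas=[X_train[f].std() / 10 for f in X_train.columns],
#             slices=[2] * len(X_train.columns),
#             poly=1, metric='R2', output=Target.REGRESSION)
#   _, score, _, _ = gin.run(n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123)
#   y_pred = gin.predict(X_test)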
-------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | The following publications about PSyKE can be cited: 2 | 3 | ### First Design 4 | 5 | > Federico Sabbatini, Giovanni Ciatto, Roberta Calegari, Andrea Omicini. “[Symbolic Knowledge Extraction from Opaque ML Predictors in PSyKE: Platform Design & Experiments](https://doi.org/10.3233/IA-210120)”. Intelligenza Artificiale 16(1):27–48, 2022. doi: 10.3233/IA-210120 6 | 7 | ```bibtex 8 | @article{psyke-ia2022, 9 | title = {Symbolic Knowledge Extraction from Opaque {ML} Predictors in {PSyKE}: Platform Design {\&} Experiments}, 10 | author = {Sabbatini, Federico and Ciatto, Giovanni and Calegari, Roberta and Omicini, Andrea}, 11 | journal = {Intelligenza Artificiale}, 12 | volume = {16}, 13 | number = {1}, 14 | pages = {27--48}, 15 | year = {2022}, 16 | doi = {10.3233/IA-210120}, 17 | url = {https://doi.org/10.3233/IA-210120}, 18 | bdsk-url-1 = {https://doi.org/10.3233/IA-210120} 19 | } 20 | ``` 21 | 22 | > Federico Sabbatini, Giovanni Ciatto, Roberta Calegari, Andrea Omicini. “[On the Design of PSyKE: A Platform for Symbolic Knowledge Extraction](http://ceur-ws.org/Vol-2963/paper14.pdf)”, in: WOA 2021 – 22nd Workshop “From Objects to Agents”, CEUR Workshop Proceedings vol. 2963, pp. 29–48, Sun SITE Central Europe, RWTH Aachen University, 2021. 23 | 24 | ```bibtex 25 | @inproceedings{psyke-woa2021, 26 | title = {On the Design of {PSyKE}: A Platform for Symbolic Knowledge Extraction}, 27 | author = {Sabbatini, Federico and Ciatto, Giovanni and Calegari, Roberta and Omicini, Andrea}, 28 | booktitle = {{WOA} 2021 -- 22nd Workshop ``From Objects to Agents''}, 29 | series = {CEUR Workshop Proceedings}, 30 | volume = {2963}, 31 | pages = {29--48}, 32 | year = {2021}, 33 | editor = {Calegari, Roberta and Ciatto, Giovanni and Denti, Enrico and Omicini, Andrea and Sartor, Giovanni}, 34 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 35 | url = {https://ceur-ws.org/Vol-2963/paper14.pdf} 36 | } 37 | ``` 38 | 39 | ### Visual Fairness Support 40 | 41 | > Federico Sabbatini. “A Fairness-Oriented Visual Extension for the PSyKE Platform”, in: AEQUITAS 2025 – 3rd Workshop on Fairness and Bias in AI. To appear on CEUR Workshop Proceedings, 2025 42 | 43 | ```bibtex 44 | @inproceedings{psyke-2025, 45 | title = {A Fairness-Oriented Visual Extension for the {PSyKE} Platform}, 46 | author = {Sabbatini, Federico}, 47 | booktitle = {Proceedings of the 3rd {AEQUITAS} Workshop on Fairness and Bias in AI, Bologna, Italy, October 26, 2025}, 48 | series = {{CEUR} Workshop Proceedings}, 49 | volume = {}, 50 | pages = {}, 51 | year = {2025}, 52 | editor = {}, 53 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 54 | url = {} 55 | } 56 | ``` 57 | 58 | ### Explainable Clustering Support 59 | 60 | > Federico Sabbatini, Roberta Calegari. “[Unlocking Insights and Trust: The Value of Explainable Clustering Algorithms for Cognitive Agents](https://ceur-ws.org/Vol-3579/paper18.pdf)”, in: WOA 2023 – 24th Workshop “From Objects to Agents”, CEUR Workshop Proceedings vol. 3579, pp. 232–245, Sun SITE Central Europe, RWTH Aachen University, 2023. 
61 | 62 | ```bibtex 63 | @inproceedings{clustering-woa2023, 64 | title = {Unlocking Insights and Trust: {The} Value of Explainable Clustering Algorithms for Cognitive Agents}, 65 | author = {Sabbatini, Federico and Calegari, Roberta}, 66 | booktitle = {Proceedings of the 24th Workshop ``From Objects to Agents'', Roma, Italy, November 6--8, 2023}, 67 | series = {{CEUR} Workshop Proceedings}, 68 | volume = {3579}, 69 | pages = {232--245}, 70 | year = {2023}, 71 | editor = {Falcone, Rino and Castelfranchi, Cristiano and Sapienza, Alessandro and Cantucci, Filippo}, 72 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 73 | url = {https://ceur-ws.org/Vol-3579/paper18.pdf} 74 | } 75 | ``` 76 | 77 | ### PSyKE for Trustworthy AI 78 | 79 | > Roberta Calegari, Federico Sabbatini. “[The PSyKE Technology for Trustworthy Artificial Intelligence](https://link.springer.com/chapter/10.1007/978-3-031-27181-6_1)”, in: AIxIA 2022 – Proceedings of the XXI International Conference of the Italian Association for Artificial Intelligence, Lecture Notes in Computer Science vol. 13796, pp. 3–16, Springer, Cham, Switzerland, 2023. doi: 10.1007/978-3-031-27181-6_1 80 | 81 | ```bibtex 82 | @inproceedings{psyke-trust-aixia2022, 83 | title = {The {PSyKE} Technology for Trustworthy Artificial Intelligence}, 84 | author = {Calegari, Roberta and Sabbatini, Federico}, 85 | booktitle = {{AIxIA} 2022 – Proceedings of the XXI International Conference of the Italian Association for Artificial Intelligence}, 86 | series = {Lecture Notes in Computer Science}, 87 | volume = {13796}, 88 | pages = {3--16}, 89 | year = {2023}, 90 | editor = {Dovier, Agostino and Montanari, Angelo and Orlandini, Andrea}, 91 | publisher = {Springer}, 92 | address = {Cham, Switzerland}, 93 | doi = {10.1007/978-3-031-27181-6_1}, 94 | url = {https://doi.org/10.1007/978-3-031-27181-6_1} 95 | } 96 | ``` 97 | 98 | ### Semantic Web Support 99 | 100 | > Federico Sabbatini, Giovanni Ciatto, Andrea Omicini. “[Semantic Web-Based Interoperability for Intelligent Agents with PSyKE](https://doi.org/10.1007/978-3-031-15565-9_8)”, in: EXTRAAMAS 2022 – 4th International Workshop on Explainable and Transparent AI and Multi-Agent Systems. Lecture Notes in Computer Science vol. 13283, pp. 124–142. Springer, Cham, Switzerland, 2022. 
doi: 10.1007/978-3-031-15565-9_8 101 | 102 | ```bibtex 103 | @incollection{psyke-extraamas2022, 104 | title = {{Semantic Web}-Based Interoperability for Intelligent Agents with {PSyKE}}, 105 | author = {Sabbatini, Federico and Ciatto, Giovanni and Omicini, Andrea}, 106 | booktitle = {EXTRAAMAS 2022 – 4th International Workshop on Explainable and Transparent AI and Multi-Agent Systems}, 107 | series = {Lecture Notes in Computer Science}, 108 | volume = {13283}, 109 | pages = {124--142}, 110 | year = {2022}, 111 | editor = {Calvaresi, Davide and Najjar, Amro and Winikoff, Michael and Fr{\"a}mling, Kary}, 112 | publisher = {Springer}, 113 | address = {Cham, Switzerland}, 114 | doi = {10.1007/978-3-031-15565-9_8}, 115 | url = {https://link.springer.com/10.1007/978-3-031-15565-9_8} 116 | } 117 | ``` -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable, Union 4 | import numpy as np 5 | import onnxruntime 6 | import pandas as pd 7 | from keras import Input, Model 8 | from keras.src.layers import Dense 9 | from onnxconverter_common import FloatTensorType, Int64TensorType, StringTensorType, DataType 10 | from sklearn.ensemble import RandomForestRegressor 11 | from sklearn.neighbors import KNeighborsClassifier 12 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 13 | from tensorflow.random import set_seed 14 | from psyke.schema import DiscreteFeature, Value 15 | from psyke.utils import get_default_random_seed 16 | from sklearn.datasets import load_iris 17 | from psyke import Extractor 18 | from psyke.utils.dataframe import get_discrete_features_supervised 19 | from test.resources.datasets import open_dataset 20 | from test.resources.predictors import PATH, get_predictor_path 21 | 22 | REQUIRED_PREDICTORS: str = PATH / '.required.csv' 23 | LE = '=<' 24 | GE = '>=' 25 | L = '<' 26 | G = '>' 27 | 28 | 29 | def get_extractor(extractor_type: str, parameters: dict): 30 | if extractor_type.lower() == 'cart': 31 | return Extractor.cart(**parameters) 32 | elif extractor_type.lower() == 'iter': 33 | return Extractor.iter(**parameters) 34 | elif extractor_type.lower() == 'real': 35 | return Extractor.real(**parameters) 36 | elif extractor_type.lower() == 'trepan': 37 | return Extractor.trepan(**parameters) 38 | elif extractor_type.lower() == 'gridex': 39 | return Extractor.gridex(**parameters) 40 | else: 41 | raise NotImplementedError(extractor_type + ' not implemented yet.') 42 | 43 | 44 | def get_model(model_type: str, parameters: dict): 45 | if model_type.lower() == 'rfr': 46 | return RandomForestRegressor(**parameters, random_state=np.random.seed(get_default_random_seed())), False 47 | elif model_type.lower() == 'knnc': 48 | return KNeighborsClassifier(**parameters), False 49 | elif model_type.lower() == 'dtc': 50 | return DecisionTreeClassifier(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 51 | elif model_type.lower() == 'dtr': 52 | return DecisionTreeRegressor(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 53 | elif model_type.lower() == 'nn': 54 | return get_simple_neural_network(**parameters, random_state=np.random.seed(get_default_random_seed())), False 55 | else: 56 | return Predictor.load_from_onnx(str(get_predictor_path(model_type))), True 57 | 58 | 59 | def get_simple_neural_network(input: int = 4, output: int = 3, 
layers: int = 3, neurons: int = 32, 60 | random_state: int = np.random.seed(get_default_random_seed())) -> Model: 61 | set_seed(random_state) 62 | input_layer = Input(input) 63 | x = input_layer 64 | for _ in range(layers-1): 65 | x = Dense(neurons, activation='relu')(x) 66 | x = Dense(output, activation='softmax')(x) 67 | model = Model(input_layer, x) 68 | model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) 69 | return model 70 | 71 | 72 | def get_dataset(name: str): 73 | if name.lower() == 'house': 74 | x = pd.read_csv(open_dataset('houseX'), index_col=[0]) 75 | y = pd.read_csv(open_dataset('housey'), index_col=[0]).MedHouseVal 76 | normalized_x = _normalize_data(x) 77 | normalized_y = _normalize_data(y) 78 | return normalized_x.join(normalized_y) 79 | elif name.lower() == 'iris': 80 | x, y = load_iris(return_X_y=True, as_frame=True) 81 | y = pd.DataFrame(y).replace({"target": {0: 'setosa', 1: 'versicolor', 2: 'virginica'}}) 82 | result = x.join(y) 83 | result.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'iris'] 84 | return result 85 | else: 86 | raise Exception('unknown dataset name.') 87 | 88 | 89 | def _normalize_data(x: pd.DataFrame) -> pd.DataFrame: 90 | return (x - x.min()) / (x.max() - x.min()) 91 | 92 | 93 | def get_schema(dataset: pd.DataFrame) -> Union[Iterable[DiscreteFeature], None]: 94 | return get_discrete_features_supervised(dataset) 95 | # return SCHEMAS[filename] if filename in SCHEMAS.keys() else None 96 | 97 | 98 | def _get_admissible_values(prepositions: Iterable[str]) -> dict[str, Value]: 99 | raise NotImplementedError('Automatic schema reading not implemented yet.') 100 | 101 | 102 | class Predictor: 103 | 104 | def __init__(self, model, from_file_onnx=False): 105 | self._model = model 106 | self._from_file_onnx = from_file_onnx 107 | 108 | @staticmethod 109 | def load_from_onnx(file: str) -> Predictor: 110 | return Predictor(onnxruntime.InferenceSession(file), True) 111 | 112 | #def save_to_onnx(self, file, initial_types: list[tuple[str, DataType]]): 113 | # file = str(file) + '.onnx' 114 | # if not self._from_file_onnx: 115 | # if os.path.exists(file): 116 | # os.remove(file) 117 | # if isinstance(self._model, Model): 118 | # save(self._model, "tmp_model") 119 | # os.system("python -m tf2onnx.convert --saved-model tmp_model --output " + file) 120 | # else: 121 | # onnx_predictor = convert_sklearn(self._model, initial_types=initial_types) 122 | # with open(file, 'wb') as f: 123 | # f.write(onnx_predictor.SerializeToString()) 124 | 125 | def predict(self, dataset: pd.DataFrame | np.ndarray) -> Iterable: 126 | array = dataset.to_numpy() if isinstance(dataset, pd.DataFrame) else dataset 127 | if self._from_file_onnx: 128 | input_name = self._model.get_inputs()[0].name 129 | label_name = self._model.get_outputs()[0].name 130 | if array.dtype == 'float64': 131 | tensor_type = np.float32 132 | elif array.dtype == 'int64' or array.dtype == 'int32': 133 | tensor_type = np.int64 134 | else: 135 | tensor_type = np.str 136 | pred_onx = self._model.run([label_name], {input_name: array.astype(tensor_type)})[0] 137 | return [prediction for plist in pred_onx for prediction in plist] if isinstance(pred_onx[0], list) \ 138 | else [prediction for prediction in pred_onx] 139 | else: 140 | return self._model.predict(dataset) 141 | 142 | # TODO: to be improved, make it more flexible 143 | @staticmethod 144 | def get_initial_types(dataset: pd.DataFrame | np.ndarray) -> list[tuple[str, DataType]]: 145 | array = dataset.to_numpy() if 
isinstance(dataset, pd.DataFrame) else dataset 146 | name = '' 147 | for column in dataset.columns: 148 | name += column + ', ' 149 | name = name[:-2] 150 | shape = [None, array.shape[1]] 151 | if array.dtype == 'float64': 152 | types = FloatTensorType(shape) 153 | elif array.dtype == 'int64': 154 | types = Int64TensorType(shape) 155 | else: 156 | types = StringTensorType(shape) 157 | return [(name, types)] 158 | -------------------------------------------------------------------------------- /psyke/tuning/pedro/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from enum import Enum 4 | 5 | from sklearn.metrics import accuracy_score 6 | 7 | from psyke import Extractor, Target 8 | from psyke.extraction.hypercubic import Grid, FeatureRanker 9 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 10 | from psyke.tuning import Objective, IterativeOptimizer, SKEOptimizer 11 | 12 | 13 | class PEDRO(SKEOptimizer, IterativeOptimizer): 14 | class Algorithm(Enum): 15 | GRIDEX = 1, 16 | GRIDREX = 2, 17 | HEX = 3 18 | 19 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 20 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 3, 21 | patience: int = 3, algorithm: Algorithm = Algorithm.GRIDREX, objective: Objective = Objective.MODEL, 22 | output: Target = Target.CONSTANT, normalization=None, discretization=None): 23 | SKEOptimizer.__init__(self, predictor, dataframe, max_error_increase, min_rule_decrease, 24 | readability_tradeoff, patience, objective, output, normalization, discretization) 25 | IterativeOptimizer.__init__(self, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 26 | max_depth, patience, output, normalization, discretization) 27 | self.algorithm = Extractor.gridrex if algorithm == PEDRO.Algorithm.GRIDREX else \ 28 | Extractor.gridex if algorithm == PEDRO.Algorithm.GRIDEX else Extractor.hex 29 | self.algorithm_name = "GridREx" if algorithm == PEDRO.Algorithm.GRIDREX else \ 30 | "GridEx" if algorithm == PEDRO.Algorithm.GRIDEX else "HEx" 31 | self.ranked = FeatureRanker(dataframe.columns[:-1]).fit(predictor, dataframe.iloc[:, :-1]).rankings() 32 | predictions = self.predictor.predict(dataframe.iloc[:, :-1]).flatten() 33 | expected = self.dataframe.iloc[:, -1].values 34 | self.error = 1 - accuracy_score(predictions, expected) if output == Target.CLASSIFICATION else \ 35 | abs(predictions - expected).mean() 36 | 37 | def _search_depth(self, strategy, critical, max_partitions): 38 | params, best = [], None 39 | 40 | for iterations in range(self.max_depth): 41 | current_params = self.__search_threshold(Grid(iterations + 1, strategy), critical, max_partitions) 42 | current_best = self._best(current_params)[1] 43 | print() 44 | best, to_break = self._check_iteration_improvement(best, current_best) 45 | params += current_params 46 | 47 | if len(params) > 1 and to_break: 48 | break 49 | return params 50 | 51 | def __search_threshold(self, grid, critical, max_partitions): 52 | step = self.error / 2.0 53 | threshold = self.error * 0.5 54 | params = [] 55 | patience = self.patience 56 | while patience > 0: 57 | print("{}. {}. Threshold = {:.2f}. 
".format(self.algorithm_name, grid, threshold), end="") 58 | param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization) 59 | if self.algorithm != Extractor.gridrex: 60 | param_dict['output'] = self.output 61 | extractor = self.algorithm(self.predictor, grid, **param_dict) 62 | _ = extractor.extract(self.dataframe) 63 | error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \ 64 | else extractor.mae 65 | error, n = (error_function(self.dataframe, self.predictor) if self.objective == Objective.MODEL else 66 | error_function(self.dataframe)), extractor.n_rules 67 | print("MAE = {:.2f}, {} rules".format(error, n)) 68 | 69 | if len(params) == 0: 70 | params.append((error, n, threshold, grid)) 71 | threshold += step 72 | continue 73 | 74 | if n > max_partitions: 75 | break 76 | 77 | if n == 1: 78 | params.append((error, n, threshold, grid)) 79 | break 80 | 81 | if error > params[0][0] * self.max_error_increase: 82 | break 83 | 84 | improvement = (params[-1][0] / error) + (1 - n / params[-1][1]) 85 | 86 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 87 | patience -= 1 88 | step = max(step, abs(error - threshold) / max(patience, 1)) 89 | elif not critical: 90 | patience = self.patience 91 | if error != params[-1][0] or n != params[-1][1]: 92 | params.append((error, n, threshold, grid)) 93 | threshold += step 94 | return params 95 | 96 | def __contains(self, strategies, strategy): 97 | for s in strategies: 98 | if strategy.equals(s, self.dataframe.columns[:-1]): 99 | return True 100 | return False 101 | 102 | def search(self): 103 | max_partitions = 200 104 | base_partitions = FixedStrategy(2).partition_number(self.dataframe.columns[:-1]) * 3 105 | if base_partitions <= max_partitions: 106 | strategies = [FixedStrategy(2)] 107 | if FixedStrategy(3).partition_number(self.dataframe.columns[:-1]) <= max_partitions: 108 | strategies.append(FixedStrategy(3)) 109 | else: 110 | strategies = [] 111 | base_partitions = max_partitions 112 | 113 | for n in [2, 3, 5, 10]: 114 | for th in [0.99, 0.75, 0.67, 0.5, 0.3]: 115 | strategy = AdaptiveStrategy(self.ranked, [(th, n)]) 116 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 117 | not self.__contains(strategies, strategy): 118 | strategies.append(strategy) 119 | 120 | for (a, b) in [(0.33, 0.67), (0.25, 0.75), (0.1, 0.9)]: 121 | strategy = AdaptiveStrategy(self.ranked, [(a, 2), (b, 3)]) 122 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 123 | not self.__contains(strategies, strategy): 124 | strategies.append(strategy) 125 | 126 | avg = 0. 
127 | for strategy in strategies: 128 | avg += strategy.partition_number(self.dataframe.columns[:-1]) 129 | avg /= len(strategies) 130 | 131 | params = [] 132 | for strategy in strategies: 133 | params += self._search_depth(strategy, 134 | strategy.partition_number(self.dataframe.columns[:-1]) > avg, 135 | base_partitions) 136 | self.params = params 137 | 138 | def _print_params(self, name, params): 139 | print("**********************") 140 | print(f"Best {name}") 141 | print("**********************") 142 | print(f"Error = {params[0]:.2f}, {params[1]} rules") 143 | print(f"Threshold = {params[2]:.2f}") 144 | print(f"Iterations = {params[3].iterations}") 145 | print(f"Strategy = {params[3].strategy}") 146 | -------------------------------------------------------------------------------- /psyke/utils/dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | from hashlib import sha256 5 | from typing import Iterable, List 6 | import pandas as pd 7 | from pandas.core.util.hashing import hash_pandas_object 8 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_integer_dtype 9 | from sklearn.preprocessing import StandardScaler 10 | from sympy.core.containers import OrderedSet 11 | 12 | from psyke import DiscreteFeature 13 | from psyke.schema import LessThan, GreaterThan, Between, Value, Constant 14 | from psyke.utils import TypeNotAllowedException, Range 15 | 16 | 17 | def split_features(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 18 | result = [] 19 | features = {'V' + str(index + 1): column for index, column in enumerate(dataframe.columns)} 20 | for feature, column in features.items(): 21 | values = set(dataframe[column]) 22 | result.append(DiscreteFeature(feature, {feature + '_' + str(i): v for i, v in enumerate(values)})) 23 | return result 24 | 25 | 26 | def get_discrete_features_supervised(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 27 | result = OrderedSet() 28 | for feature in dataframe.columns[:-1]: 29 | result.add(DiscreteFeature(feature, create_set(feature, dataframe))) 30 | return result 31 | 32 | 33 | def create_set(feature: str, dataframe: pd.DataFrame) -> dict[str, Value]: 34 | if is_string_dtype(dataframe[feature]) or is_integer_dtype(dataframe[feature]): 35 | values = dataframe[feature].unique() 36 | elif is_numeric_dtype(dataframe[feature]): 37 | values = create_ranges(feature, dataframe) 38 | else: 39 | raise TypeNotAllowedException(dataframe[feature].dtype) 40 | return {"{}_{}".format(feature, i): create_original_value(v) for (i, v) in enumerate(values)} 41 | 42 | 43 | def create_original_value(value: Range | str | int) -> Value: 44 | if isinstance(value, Range): 45 | if value.lower == float('-inf'): 46 | return LessThan(value.upper) 47 | if value.upper == float('inf'): 48 | return GreaterThan(value.lower) 49 | return Between(value.lower, value.upper) 50 | return Constant(value) 51 | 52 | 53 | def create_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 54 | ranges = init_ranges(feature, dataframe) 55 | expand_ranges(ranges) 56 | ranges[0].left_infinite() 57 | ranges[-1].right_infinite() 58 | return ranges 59 | 60 | 61 | def expand_ranges(ranges: Iterable[Range]): 62 | for r1, r2 in zip(ranges[0:-1], ranges[1:]): 63 | while r1.upper < r2.lower: 64 | r1.expand_right() 65 | r2.expand_left() 66 | mean = ((r1.upper - r1.std + r2.lower + r2.std) / 2) 67 | r1.upper = mean 68 | r2.lower = mean 69 | 70 | 71 | def 
init_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 72 | desc = [dataframe[dataframe.iloc[:, -1] == v].describe()[feature] for v in dataframe.iloc[:, -1].unique()] 73 | desc = [(d['mean'], d['std']) for d in desc] 74 | desc.sort() 75 | return [Range(d[0], d[1]) for d in desc] 76 | 77 | 78 | def get_discrete_features_equal_frequency( 79 | dataframe: pd.DataFrame, 80 | bins: int = None, 81 | output: bool = True, 82 | bin_names: List[str] = [] 83 | ) -> Iterable[DiscreteFeature]: 84 | features = dataframe.columns[:-1] if output else dataframe.columns 85 | result = set() 86 | if bins is None: 87 | if len(bin_names) > 0: 88 | bins = len(bin_names) 89 | else: 90 | raise ValueError("No bins nor bin_names have been provided") 91 | elif bins > 0: 92 | if len(bin_names) == 0: 93 | bin_names = range(0, bins) 94 | elif len(bin_names) == bins: 95 | pass 96 | else: 97 | raise ValueError("Mismatch among the provided amount of bins and the bin_names") 98 | else: 99 | raise ValueError("Negative amount of bins makes no sense") 100 | for feature in features: 101 | values = sorted(dataframe[feature]) 102 | intervals = [values[i * math.ceil(len(values) / bins)] for i in range(1, bins)] 103 | starting_interval: list[Value] = [LessThan(intervals[0])] 104 | ending_interval: list[Value] = [GreaterThan(intervals[-1])] 105 | middle_intervals: list[Value] = [Between(intervals[i], intervals[i + 1]) for i in range(0, len(intervals) - 1)] 106 | new_intervals = starting_interval + middle_intervals + ending_interval 107 | new_feature_names = [feature + '_' + str(i) for i in range(0, bins)] 108 | new_features = {new_feature_names[i]: new_intervals[i] for i in range(0, bins)} 109 | result.add(DiscreteFeature(feature, new_features)) 110 | return result 111 | 112 | 113 | def get_discrete_dataset(dataset: pd.DataFrame, discrete_features: Iterable[DiscreteFeature], 114 | sort: bool = True) -> pd.DataFrame: 115 | """ 116 | Create a new dataset mapping the old features into the new discrete features. 117 | Note: some algorithms require the same SORTED feature to be replicable due to rule optimization and other stuffs. 118 | Therefore the new features are alphabetically sorted. 119 | This is not strictly necessary because internally those algorithms perform the sorting themself. 120 | However it is a good idea to have this same function returning the same result w.r.t. the inputs. 
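
    Illustrative call (a sketch; `df` is a placeholder for a DataFrame whose last column is the target):
        discrete = get_discrete_dataset(df.iloc[:, :-1], get_discrete_features_supervised(df))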
121 | 122 | :param dataset: the original dataset 123 | :param discrete_features: mapping for the features 124 | :param sort: alphabetically sort new features 125 | :return: the new discrete dataset 126 | """ 127 | columns_name = [key for feature in discrete_features for key, _ in feature.admissible_values.items()] 128 | if sort: 129 | columns_name = sorted(columns_name) 130 | new_dataset = pd.DataFrame(columns=columns_name) 131 | for feature in discrete_features: 132 | for index, value in enumerate(dataset[feature.name]): 133 | for key, admissible_value in feature.admissible_values.items(): 134 | new_dataset.loc[index, key] = int(admissible_value.is_in(value)) 135 | 136 | for feature in discrete_features: 137 | for new_feature in feature.admissible_values.keys(): 138 | new_dataset[new_feature] = new_dataset[new_feature].astype(str).astype(int) 139 | 140 | return new_dataset 141 | 142 | 143 | def get_scaled_dataset(dataset: pd.DataFrame) -> tuple[pd.DataFrame, dict[str, tuple[float, float]]]: 144 | scaler = StandardScaler() 145 | scaler.fit(dataset) 146 | normalization = {key: (m, s) for key, m, s in zip(dataset.columns, scaler.mean_, scaler.scale_)} 147 | return pd.DataFrame(scaler.transform(dataset), columns=dataset.columns, index=dataset.index), normalization 148 | 149 | 150 | def scale_dataset(dataset: pd.DataFrame, normalization: dict[str, tuple[float, float]]) -> pd.DataFrame: 151 | new_data = pd.DataFrame() 152 | for column in dataset.columns: 153 | m, s = normalization[column] 154 | new_data[column] = (dataset[column] - m) / s 155 | return new_data 156 | 157 | 158 | class HashableDataFrame(pd.DataFrame): 159 | def __init__(self, obj): 160 | super().__init__(obj) 161 | 162 | def __hash__(self): 163 | hash_value = sha256(hash_pandas_object(self, index=True).values) 164 | hash_value = hash(hash_value.hexdigest()) 165 | return hash_value 166 | 167 | def __eq__(self, other): 168 | return self.equals(other) 169 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.trepan.utils import Node, Split, SplitLogic 4 | from psyke import DiscreteFeature 5 | from psyke.utils.logic import create_term, create_variable_list, create_head 6 | from psyke.utils.sorted import SortedList 7 | from tuprolog.core import Var, Struct, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable, Union, Any 10 | import pandas as pd 11 | 12 | 13 | class Trepan(PedagogicalExtractor): 14 | 15 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature], min_examples: int = 0, max_depth: int = 3, 16 | split_logic: SplitLogic = SplitLogic.DEFAULT): 17 | super().__init__(predictor, discretization) 18 | self._ignore_feature = [] 19 | self.min_examples = min_examples 20 | self.max_depth = max_depth 21 | self.split_logic = split_logic 22 | self._root: Node 23 | 24 | def make_fair(self, features: Iterable[str]): 25 | self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 26 | if self.discretization else [features] 27 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 28 | 29 | @property 30 | def n_rules(self): 31 | return sum(1 for _ in self._root) 32 | 33 | def _best_split(self, node: Node, names: Iterable[str]) -> 
Union[tuple[Node, Node], None]: 34 | if node.samples.shape[0] < self.min_examples: 35 | raise NotImplementedError() 36 | if node.n_classes == 1: 37 | return None 38 | splits = self._create_splits(node, names) 39 | return None if len(splits) == 0 or splits[0].children[0].depth > self.max_depth else splits[0].children 40 | 41 | def _compact(self): 42 | nodes = [self._root] 43 | while len(nodes) > 0: 44 | node = nodes.pop() 45 | for item in self._nodes_to_remove(node, nodes): 46 | node.children.remove(item) 47 | node.children += item.children 48 | 49 | def _create_body(self, variables: dict[str, Var], node: Node) -> Iterable[Struct]: 50 | result = [] 51 | for constraint, value in node.constraints: 52 | feature: DiscreteFeature = [d for d in self.discretization if constraint in d.admissible_values][0] 53 | result.append(create_term(variables[feature.name], feature.admissible_values[constraint], value == 1.0)) 54 | return result 55 | 56 | @staticmethod 57 | def _create_samples(node: Node, column: str, value: float) -> pd.DataFrame: 58 | return node.samples.loc[node.samples[column] == value] 59 | 60 | @staticmethod 61 | def _create_split(node: Node, column: str) -> Union[Split, None]: 62 | true_examples = Trepan._create_samples(node, column, 1.0) 63 | false_examples = Trepan._create_samples(node, column, 0.0) 64 | true_constraints = list(node.constraints) + [(column, 1.0)] 65 | false_constraints = list(node.constraints) + [(column, 0.0)] 66 | true_node = Node(true_examples, node.n_examples, true_constraints, depth=node.depth + 1) \ 67 | if true_examples.shape[0] > 0 else None 68 | false_node = Node(false_examples, node.n_examples, false_constraints, depth=node.depth + 1) \ 69 | if false_examples.shape[0] > 0 else None 70 | return None if true_node is None or false_node is None else Split(node, (true_node, false_node)) 71 | 72 | def _create_splits(self, node: Node, names: Iterable[str]) -> SortedList[Split]: 73 | splits, constraints = Trepan._init_splits(node) 74 | for column in [column for column in names if column not in list(constraints) + self._ignore_feature]: 75 | split = Trepan._create_split(node, column) 76 | if split is not None: 77 | splits.add(split) 78 | return splits 79 | 80 | def _create_theory(self, name: str) -> MutableTheory: 81 | theory = mutable_theory() 82 | for node in self._root: 83 | variables = create_variable_list(self.discretization) 84 | theory.assertZ( 85 | clause( 86 | create_head(name, list(variables.values()), str(node.dominant)), 87 | self._create_body(variables, node) 88 | ) 89 | ) 90 | return theory 91 | 92 | def _init(self, dateset: pd.DataFrame) -> SortedList[Node]: 93 | self._root = Node(dateset, dateset.shape[0]) 94 | queue: SortedList[Node] = SortedList(lambda x, y: int(x.priority - y.priority)) 95 | queue.add(self._root) 96 | return queue 97 | 98 | @staticmethod 99 | def _init_splits(node: Node) -> tuple[SortedList[Split], Iterable[str]]: 100 | return SortedList(lambda x, y: int(x.priority - y.priority)),\ 101 | set(constraint[0] for constraint in node.constraints) 102 | 103 | @staticmethod 104 | def _nodes_to_remove(node: Node, nodes: list[Node]) -> list[Node]: 105 | to_remove = [] 106 | for child in node.children: 107 | if node.dominant == child.dominant and len(child.children) == 1: 108 | to_remove.append(child) 109 | nodes.append(node) 110 | else: 111 | nodes.append(child) 112 | return to_remove 113 | 114 | @staticmethod 115 | def _internal_predict(x: pd.Series, node: Node, categories: Iterable) -> Any: 116 | for child in node.children: 117 | skip = 
False 118 | for constraint, value in child.constraints: 119 | if x[constraint] != value: 120 | skip = True 121 | continue 122 | if not skip: 123 | return Trepan._internal_predict(x, child, categories) 124 | return node.dominant 125 | 126 | def _optimize(self) -> None: 127 | n, nodes = 0, [self._root] 128 | while len(nodes) > 0: 129 | n += Trepan._remove_nodes(nodes) 130 | self._compact() if n == 0 else self._optimize() 131 | 132 | @staticmethod 133 | def _remove_nodes(nodes: list[Node]) -> int: 134 | node = nodes.pop() 135 | to_remove = [child for child in node.children if len(child.children) == 0 and node.dominant == child.dominant] 136 | for child in to_remove: 137 | node.children.remove(child) 138 | for child in node.children: 139 | if len(child.children) > 0: 140 | nodes.append(child) 141 | return len(to_remove) 142 | 143 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 144 | queue = self._init(dataframe) 145 | while len(queue) > 0: 146 | node = queue.pop() 147 | if self.split_logic == SplitLogic.DEFAULT: 148 | best: Union[tuple[Node, Node], None] = self._best_split(node, dataframe.columns[:-1]) 149 | if best is None: 150 | continue 151 | else: 152 | raise Exception('Illegal split logic') 153 | queue.add_all(best) 154 | node.children += list(best) 155 | self._optimize() 156 | return self._create_theory(dataframe.columns[-1]) 157 | 158 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 159 | return np.array( 160 | [Trepan._internal_predict(sample, self._root, dataframe.columns[-1]) for _, sample in dataframe.iterrows()] 161 | ) 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PSyKE 2 | 3 | ![PSyKE Logo](.img/logo-wide.png) 4 | 5 | Quick links: 6 | * [Home Page](https://apice.unibo.it/xwiki/bin/view/PSyKE/) 7 | * [GitHub Repository](https://github.com/psykei/psyke-python) 8 | * [PyPi Repository](https://pypi.org/project/psyke/) 9 | * [Issues](https://github.com/psykei/psyke-python/issues) 10 | 11 | ## Latest Releases 12 | 13 | * PSyKE 1.0: Compatibility with Python 3.11.x 14 | * PSyKE 0.10: New genetic algorithms for knowledge extraction 15 | * PSyKE 0.9: Fairness mitigation support for knowedge extractors 16 | * PSyKE 0.8: New features: local explainability and counterfactual support 17 | * PSyKE 0.7: New SKE algorithms implemented 18 | 19 | ## Intro 20 | 21 | [PSyKE](https://apice.unibo.it/xwiki/bin/view/PSyKE/) (Platform for Symbolic Knowledge Extraction) 22 | is intended as a library for extracting symbolic knowledge (in the form of logic rule lists) out of sub-symbolic predictors. 23 | 24 | More precisely, PSyKE offers a general purpose API for knowledge extraction, and a number of different algorithms implementing it, 25 | supporting both classification and regression problems. 26 | The extracted knowledge consists of a Prolog theory (i.e., a list of Horn clauses) or an OWL ontology containing SWRL rules. 27 | 28 | PSyKE relies on [2ppy](https://github.com/tuProlog/2ppy) (tuProlog in Python) for logic support, which in turn is based on the [2p-Kt](https://github.com/tuProlog/2p-kt) logic ecosystem. 
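
For instance, extracting rules from a trained regressor boils down to a few calls
(a minimal sketch: the dataset, the hyper-parameter values and the choice of GridEx are purely
illustrative; analogous factory methods exist for the other algorithms listed below):

```python
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from psyke import Extractor
from psyke.extraction.hypercubic import Grid
from psyke.extraction.hypercubic.strategy import FixedStrategy

# `dataset` is any pandas DataFrame whose last column is the (numeric) target
train, test = train_test_split(dataset, test_size=0.5)
predictor = KNeighborsRegressor().fit(train.iloc[:, :-1], train.iloc[:, -1])

extractor = Extractor.gridex(predictor, Grid(1, FixedStrategy(2)),
                             min_examples=25, threshold=0.1)
theory = extractor.extract(train)                      # logic theory (list of Horn clauses)
predictions = extractor.predict(test.iloc[:, :-1])     # predictions from the extracted rules
print(extractor.mae(test))                             # mean absolute error of the rules on unseen data
```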
29 | 30 | ### Class diagram overview: 31 | 32 | ![PSyKE class diagram](http://www.plantuml.com/plantuml/svg/PLBBRkem4DtdAqQixeLcqsN40aHfLQch2dM341gS0IpoY3oJYfJctnl7RkgcKZRdCUFZ4ozOq4YTPr65we8dWlkgQcuHmEPCfMbW6iDaEe5LXZLJr4QHof3PgxVMGoTtS5XJSNCXkwVxlhdUguzQeUYoi28u3bxNovS0RWnLM7H46mNZXaw6c4UZpq8cW4z6ftGTZoeq4WwjB6x7BbPdoZ7qFMXMXeGU2QKsv2I06HmTiIymfmHOpA1WccjcVSXe_uvPJPn0gfLiEyyTl5bcrtk7qzTNCQYaDBxhyQ6_BFFFEExJ_sLzXoFMLpdcVMrZrhVNvS83zygFmrv-1fMXL5lOezH5rH_z7qqWqonRbn-72-nwAxaz_r8KP9B_YNz3uTP0jFcmAt6xB9gT3UJSC8_Z87G2PIrLBL0UemKLQPrdNm00) 33 | 34 | 37 | 38 | PSyKE is designed around the notion of _extractor_. 39 | More precisely, an `Extractor` is any object capable of extracting a logic `Theory` out of a trained sub-symbolic regressor or classifier. 40 | Accordingly, an `Extractor` is composed of 41 | _(i)_ a trained predictor (i.e., black-box used as an oracle) and 42 | _(ii)_ a set of feature descriptors, and it provides two methods: 43 | * `extract`: returns a logic theory given a dataset; 44 | * `predict`: predicts a value using the extracted rules (instead of the original predictor). 45 | 46 | Currently, the supported extraction algorithms are: 47 | * [CART](https://doi.org/10.1201/9781315139470), 48 | straightforward extracts rules from both classification and regression decision trees; 49 | * Classification: 50 | * [REAL](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1) (Rule Extraction As Learning), 51 | generates and generalizes rules strarting from dataset samples; 52 | * [Trepan](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1), 53 | generates rules by inducing a decision tree and possibly exploiting m-of-n expressions; 54 | * Regression: 55 | * [ITER](http://dx.doi.org/10.1007/11823728_26), 56 | builds and iteratively expands hypercubes in the input space. 57 | Each cube holds a constant value, that is the estimated output for the samples inside the cube; 58 | * [GridEx](http://dx.doi.org/10.1007/978-3-030-82017-6_2), 59 | extension of the ITER algorithm that produces shorter rule lists retaining higher fidelity w.r.t. the predictor. 60 | * GridREx, 61 | extension of GridEx where the output of each hypercube is a linear combination of the input variables and not a constant value. 62 | 63 | Users may exploit the PEDRO algorithm, included in PSyKE, to tune the optimal values for GridEx and GridREx hyper-parameters. 64 | 65 | We are working on PSyKE to extend its features to encompass explainable clustering tasks, as well as to make more general-purpose the supported extraction algorithms (e.g., by adding classification support to GridEx and GridREx). 66 | 67 | ## Users 68 | 69 | ### End users 70 | 71 | PSyKE is deployed as a library on Pypi. It can be installed as Python package by running: 72 | ```bash 73 | pip install psyke 74 | ``` 75 | 76 | #### Requirements 77 | 78 | Please refer to the [requirements file](https://github.com/psykei/psyke-python/blob/master/requirements.txt) 79 | 80 | ##### Test requirements 81 | * `skl2onnx` 82 | * `onnxruntime` 83 | * `parameterized` 84 | 85 | Once installed, it is possible to create an extractor from a predictor 86 | (e.g. Neural Network, Support Vector Machine, K-Nearest Neighbours, Random Forest, etc.) 87 | and from the data set used to train the predictor. 88 | 89 | > **Note:** the predictor must expose a method named `predict` to be properly used as an oracle. 90 | 91 | #### End users 92 | 93 | A brief example is presented in `demo.py` script in the `demo/` folder. 
94 | Using `sklearn`'s Iris data set we train a K-Nearest Neighbours classifier to predict the correct output class.
95 | Before training, we make the dataset discrete.
96 | After that, we create two different extractors: REAL and Trepan.
97 | We output the extracted theory for both extractors.
98 | 
99 | REAL extracted rules:
100 | ```
101 | iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :- PetalWidth =< 1.0.
102 | iris(PetalLength1, PetalWidth1, SepalLength1, SepalWidth1, versicolor) :- PetalLength1 > 4.9, SepalWidth1 in [2.9, 3.2].
103 | iris(PetalLength2, PetalWidth2, SepalLength2, SepalWidth2, versicolor) :- PetalWidth2 > 1.6.
104 | iris(PetalLength3, PetalWidth3, SepalLength3, SepalWidth3, virginica) :- SepalWidth3 =< 2.9.
105 | iris(PetalLength4, PetalWidth4, SepalLength4, SepalWidth4, virginica) :- SepalLength4 in [5.4, 6.3].
106 | iris(PetalLength5, PetalWidth5, SepalLength5, SepalWidth5, virginica) :- PetalWidth5 in [1.0, 1.6].
107 | ```
108 | 
109 | Trepan extracted rules:
110 | ```
111 | iris(PetalLength6, PetalWidth6, SepalLength6, SepalWidth6, virginica) :- PetalLength6 > 3.0, PetalLength6 in [3.0, 4.9].
112 | iris(PetalLength7, PetalWidth7, SepalLength7, SepalWidth7, versicolor) :- PetalLength7 > 3.0.
113 | iris(PetalLength8, PetalWidth8, SepalLength8, SepalWidth8, setosa) :- true.
114 | ```
115 | 
116 | 
117 | ## Developers
118 | 
119 | Working with the PSyKE codebase requires a number of tools to be installed:
120 | * Python 3.11
121 | + Python versions >= `3.12.x` are currently __not__ supported
122 | 
123 | * JDK 11+ (please ensure the `JAVA_HOME` environment variable is properly configured)
124 | * Git 2.20+
125 | 
126 | ### Develop PSyKE with PyCharm
127 | 
128 | To participate in the development of PSyKE, we suggest the [PyCharm](https://www.jetbrains.com/pycharm/) IDE.
129 | 
130 | #### Importing the project
131 | 
132 | 1. Clone this repository into a folder of your preference using `git clone`
133 | 2. Open PyCharm
134 | 3. Select `Open`
135 | 4. Navigate your file system and find the folder where you cloned the repository
136 | 5. Click `Open`
137 | 
138 | ### Developing the project
139 | 
140 | Contributions to this project are welcome. Just a few rules:
141 | * We use [git flow](https://github.com/nvie/gitflow), so if you write new features, please do so in a separate `feature/` branch
142 | * We recommend forking the project, developing your code, then contributing back via pull request
143 | * Commit often
144 | * Stay in sync with the `develop` (or `master`) branch (pull frequently if the build passes)
145 | * Do not introduce low-quality or untested code
146 | 
147 | #### Issue tracking
148 | If you encounter any problem while using or developing PSyKE, please report it through the project
149 | ["Issues" section](https://github.com/psykei/psyke-python/issues) on GitHub. 
150 | -------------------------------------------------------------------------------- /psyke/utils/plot.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Callable, Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from matplotlib import colors 6 | import matplotlib.pyplot as plt 7 | from matplotlib.lines import Line2D 8 | from tuprolog.solve.prolog import prolog_solver 9 | from tuprolog.theory import Theory, mutable_theory 10 | 11 | from psyke.extraction.hypercubic import HyperCubeExtractor 12 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 13 | 14 | import matplotlib 15 | #matplotlib.use('TkAgg') 16 | 17 | 18 | def plot_init(xlim, ylim, xlabel, ylabel, size=(4, 3), equal=False): 19 | plt.figure(figsize=size) 20 | if equal: 21 | plt.gca().set_aspect(1) 22 | plt.xlim(xlim) 23 | plt.ylim(ylim) 24 | plt.gca().set_xlabel(xlabel) 25 | plt.gca().set_ylabel(ylabel) 26 | plt.gca().set_rasterized(True) 27 | 28 | 29 | def plot_point(x, y, color, marker, ec=None): 30 | plt.scatter(x, y, c=color, marker=marker, edgecolors=ec, linewidths=0.6) 31 | 32 | 33 | def plot_classification_samples(dataframe, classes, colors, markers, labels, loc, name, show=True, ec=None): 34 | marks = [Line2D([0], [0], color=c, marker=m, lw="0") for c, m in zip(colors, markers)] 35 | 36 | for cl, c, m in zip(classes, colors, markers): 37 | df = dataframe[dataframe.target == cl] 38 | plot_point(df["petal length"], df["petal width"], c, m, ec=ec) 39 | 40 | plt.gca().legend(marks, labels, loc=loc) 41 | plt.savefig("plot/{}.pdf".format(name), dpi=500, bbox_inches='tight') 42 | if show: 43 | plt.show() 44 | 45 | 46 | def plot_boundaries(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 47 | a: float = .5, h: str = '////////', ls='-', e=.05, fc='none', ec=None, reverse=False): 48 | cubes = extractor._hypercubes.copy() 49 | if reverse: 50 | cubes.reverse() 51 | for cube in cubes: 52 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 53 | fc=colors[cube.output] if fc is None else fc, 54 | ec=colors[cube.output] if ec is None else ec, alpha=a, hatch=h, linestyle=ls) 55 | 56 | 57 | def plot_surfaces(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], ec='r', e=.05): 58 | for cube in extractor._hypercubes: 59 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 60 | fc='none', ec=ec) 61 | 62 | 63 | def plot_perimeters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], n: int = 5, 64 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 65 | for cube in extractor._hypercubes: 66 | for corner in cube.perimeter_samples(n): 67 | plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 68 | 69 | 70 | def plot_centers(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 71 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 72 | for cube in extractor._hypercubes: 73 | center = cube.center 74 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 75 | 76 | 77 | def plot_corners(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 78 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 79 | for cube in extractor._hypercubes: 80 | for corner in cube.corners(): 81 | 
plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 82 | 83 | 84 | def plot_barycenters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 85 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 86 | for cube in extractor._hypercubes: 87 | center = cube.barycenter 88 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 89 | 90 | 91 | def predict_from_theory(theory: Theory, data: pd.DataFrame) -> list[float or str]: 92 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 93 | index = data.shape[1] - 1 94 | y_element = data.iloc[0, -1] 95 | cast: Callable = lambda x: (str(x) if isinstance(y_element, str) else x) 96 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in data.iterrows()] 97 | return [cast(query.solved_query.get_arg_at(index)) if query.is_yes else -1 for query in substitutions] 98 | 99 | 100 | def plot_theory(theory: Theory, data: pd.DataFrame = None, output: str = 'plot.pdf', azimuth: float = 45, 101 | distance: float = 9, elevation: float = 5, show_theory: bool = True, features: Iterable[str] = None) -> None: 102 | # Check if the number of common variables in clauses is less or equal to three. 103 | # If not raise an exception. 104 | fresh_theory = mutable_theory(theory) 105 | clauses = fresh_theory.clauses 106 | variables = sorted(list(set(arg.args[0].name.split('_')[0] for clause in clauses if clause.body_size > 0 and clause.body.is_recursive for arg in clause.body.unfolded)), reverse=True) 107 | if len(variables) > 3: 108 | raise Exception("Theory contains too many different features in the body of clauses, maximum is 3.") 109 | # If data is None, then create synthetic data covering a good portion of the variables space. 110 | # Just skip for now. 
111 | if data is None: 112 | raise Exception("Method without data is not implemented yet") 113 | 114 | # Prepare data 115 | ys = predict_from_theory(fresh_theory, data) 116 | xs = data[variables].values.tolist() 117 | for i in range(len(ys)): 118 | xs[i].append(ys[i]) 119 | 120 | # Prepare colors 121 | if isinstance(ys[0], str): 122 | np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 123 | 124 | class ColorGenerator: 125 | 126 | def __init__(self): 127 | self.color_list = ['red', 'royalblue', 'green', 'orange', 'pink', 'acqua', 'grey'] 128 | self.counter = 0 129 | 130 | def get_new_color(self) -> str: 131 | self.counter += 1 132 | if self.counter > len(self.color_list): 133 | raise Exception("Classes exceed the maximum supported number (7)") 134 | return self.color_list[self.counter - 1] 135 | 136 | classes = set(ys) 137 | generator = ColorGenerator() 138 | class_color = {c: generator.get_new_color() for c in classes} 139 | get_color: Callable = lambda c: class_color[c] 140 | else: 141 | def color_fader(v: float = 0., c1: str = 'green', c2: str = 'red'): 142 | c1 = array(colors.to_rgb(c1)) 143 | c2 = array(colors.to_rgb(c2)) 144 | return colors.to_hex((1 - v) * c1 + v * c2) 145 | min_value = min(ys) 146 | max_value = max(ys) 147 | get_normalized_value: Callable = lambda v: (v - min_value)/(max_value - min_value) 148 | get_color: Callable = lambda c: color_fader(get_normalized_value(c)) 149 | 150 | fig = plt.figure() 151 | fig.set_size_inches(10, 10) 152 | if len(variables) == 3: 153 | ax = fig.add_subplot(projection='3d') 154 | else: 155 | ax = fig.add_subplot() 156 | 157 | for x in xs: 158 | ax.scatter(*x[:-1], c=get_color(x[-1]), s=14) 159 | 160 | ax.set_xlabel(variables[0], fontsize=18) 161 | ax.set_ylabel(variables[1], fontsize=18) 162 | if len(variables) == 3: 163 | ax.set_zlabel(variables[2], fontsize=18) 164 | 165 | ax.azim = azimuth 166 | ax.dist = distance 167 | ax.elev = elevation 168 | ax.set_title('Predictions according to Prolog theory', fontsize=24) 169 | if show_theory: 170 | pass 171 | # ax.text2D(0., 0.88, pretty_theory(theory, new_line=False), transform=ax.transAxes, fontsize=8) 172 | if isinstance(ys[0], str): 173 | custom_lines = [Line2D([0], [0], marker='o', markerfacecolor=get_color(c), 174 | markersize=20, color='w') for c in classes] 175 | ax.legend(custom_lines, classes, loc='upper left', numpoints=1, ncol=3, fontsize=18, bbox_to_anchor=(0, 0)) 176 | plt.savefig(output, format='pdf') 177 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | 4 | from sklearn.metrics import accuracy_score, r2_score 5 | 6 | 7 | class Node: 8 | def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None): 9 | self.feature = feature 10 | self.threshold = threshold 11 | self.left = left 12 | self.right = right 13 | self.value = value 14 | 15 | def is_leaf_node(self): 16 | return self.value is not None 17 | 18 | 19 | class FairTree: 20 | def __init__(self, max_depth=3, max_leaves=None, criterion=None, min_samples_split=2, lambda_penalty=0.0, 21 | protected_attr=None): 22 | self.max_depth = max_depth 23 | self.max_leaves = max_leaves 24 | self.min_samples_split = min_samples_split 25 | self.lambda_penalty = lambda_penalty 26 | self.protected_attr = protected_attr 27 | self.criterion = criterion 28 | self.root = None 29 | 
self.n_leaves = 0 30 | self.quality_function = None 31 | 32 | def fit(self, X, y): 33 | self.n_leaves = 0 34 | self.root = self._grow_tree(X, y, depth=0) 35 | while self.n_leaves > self.max_leaves: 36 | self.prune_least_important_leaf(X, y) 37 | self.n_leaves -= 1 38 | return self 39 | 40 | @staticmethod 41 | def _estimate_output(y): 42 | raise NotImplementedError 43 | 44 | def score(self, X, y): 45 | raise NotImplementedError 46 | 47 | def predict(self, X): 48 | return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()]) 49 | 50 | def _traverse_tree(self, x, node): 51 | if node.is_leaf_node(): 52 | return node.value 53 | if x[node.feature] <= node.threshold: 54 | return self._traverse_tree(x, node.left) 55 | return self._traverse_tree(x, node.right) 56 | 57 | def _grow_tree(self, X, y, depth): 58 | if depth >= self.max_depth or X.shape[0] < self.min_samples_split or len(set(y.values.flatten())) == 1 or \ 59 | (self.max_leaves is not None and self.n_leaves >= self.max_leaves): 60 | self.n_leaves += 1 61 | return Node(value=self._estimate_output(y)) 62 | 63 | best_feature, best_threshold = self._best_split(X, y) 64 | if best_feature is None: 65 | self.n_leaves += 1 66 | return Node(value=self._estimate_output(y)) 67 | 68 | left_idxs = X[best_feature] <= best_threshold 69 | right_idxs = X[best_feature] > best_threshold 70 | 71 | left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1) 72 | right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1) 73 | return Node(best_feature, best_threshold, left, right) 74 | 75 | @staticmethod 76 | def generate_thresholds(X, y): 77 | sorted_indices = np.argsort(X) 78 | X = np.array(X)[sorted_indices] 79 | y = np.array(y)[sorted_indices] 80 | # X = np.array(np.unique(np.unique(list(zip(X, y)), axis=0)[:, 0]), dtype=float) 81 | return np.array([(X[:-1][i] + X[1:][i]) / 2.0 for i in range(len(X) - 1) if y[i] != y[i + 1]]) 82 | 83 | def _best_split(self, X, y): 84 | best_gain = -float('inf') 85 | split_idx, split_threshold = None, None 86 | 87 | for feature in [feature for feature in X.columns if feature not in self.protected_attr]: 88 | # for threshold in self.generate_thresholds(X[feature], y): 89 | for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))): 90 | left_idxs = X[feature] <= threshold 91 | right_idxs = X[feature] > threshold 92 | 93 | if left_idxs.sum() == 0 or right_idxs.sum() == 0: 94 | continue 95 | 96 | gain = self._fair_gain(y, left_idxs, right_idxs, X[self.protected_attr]) 97 | 98 | if gain > best_gain: 99 | best_gain = gain 100 | split_idx = feature 101 | split_threshold = threshold 102 | return split_idx, split_threshold 103 | 104 | @staticmethod 105 | def _disparity(group): 106 | counts = Counter(group) 107 | if len(counts) <= 1: 108 | return 0.0 109 | values = np.array(list(counts.values())) / len(group) 110 | return np.abs(values[0] - values[1]) 111 | 112 | def _fair_gain(self, y, left_idx, right_idx, protected): 113 | child = len(y[left_idx]) / len(y) * self.quality_function(y[left_idx]) + \ 114 | len(y[right_idx]) / len(y) * self.quality_function(y[right_idx]) 115 | info_gain = self.quality_function(y) - child 116 | penalty = self._disparity(protected[left_idx]) + self._disparity(protected[right_idx]) 117 | return info_gain - self.lambda_penalty * penalty 118 | 119 | @staticmethod 120 | def _match_path(x, path): 121 | for node, left in path: 122 | if left and x[node.feature] > node.threshold: 123 | return False 124 | if not left and x[node.feature] <= node.threshold: 125 | return 
False 126 | return True 127 | 128 | @staticmethod 129 | def candidates(node, parent=None, is_left=None, path=[]): 130 | if node is None or node.is_leaf_node(): 131 | return [] 132 | leaves = [] 133 | if node.left.is_leaf_node() and node.right.is_leaf_node(): 134 | leaves.append((node, parent, is_left, path)) 135 | leaves += FairTreeClassifier.candidates(node.left, node, True, path + [(node, True)]) 136 | leaves += FairTreeClassifier.candidates(node.right, node, False, path + [(node, False)]) 137 | return leaves 138 | 139 | def prune_least_important_leaf(self, X, y): 140 | best_score = -np.inf 141 | best_prune = None 142 | 143 | for node, parent, is_left, path in self.candidates(self.root): 144 | original_left = node.left 145 | original_right = node.right 146 | 147 | merged_y = y[(X.apply(lambda x: self._match_path(x, path), axis=1))] 148 | if len(merged_y) == 0: 149 | continue 150 | new_value = self._estimate_output(merged_y) 151 | node.left = node.right = None 152 | node.value = new_value 153 | 154 | score = self.score(X, y) 155 | if score >= best_score: 156 | best_score = score 157 | best_prune = (node, new_value) 158 | 159 | node.left, node.right, node.value = original_left, original_right, None 160 | 161 | if best_prune: 162 | best_prune[0].left = best_prune[0].right = None 163 | best_prune[0].value = best_prune[1] 164 | 165 | 166 | class FairTreeClassifier(FairTree): 167 | def __init__(self, max_depth=3, max_leaves=None, criterion='entropy', min_samples_split=2, lambda_penalty=0.0, 168 | protected_attr=None): 169 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 170 | self.quality_function = self._gini if self.criterion == 'gini' else self._entropy 171 | 172 | @staticmethod 173 | def _estimate_output(y): 174 | return Counter(y.values.flatten()).most_common(1)[0][0] 175 | 176 | def score(self, X, y): 177 | return accuracy_score(y.values.flatten(), self.predict(X)) 178 | 179 | @staticmethod 180 | def _entropy(y): 181 | ps = np.unique(y, return_counts=True)[1] / len(y) 182 | return -np.sum([p * np.log2(p) for p in ps if p > 0]) 183 | 184 | @staticmethod 185 | def _gini(y): 186 | return 1.0 - np.sum(np.unique(y, return_counts=True)[1] / len(y)**2) 187 | 188 | 189 | class FairTreeRegressor(FairTree): 190 | def __init__(self, max_depth=3, max_leaves=None, criterion='mse', min_samples_split=2, lambda_penalty=0.0, 191 | protected_attr=None): 192 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 193 | self.quality_function = self._mse 194 | 195 | @staticmethod 196 | def _estimate_output(y): 197 | return np.mean(y.values.flatten()) 198 | 199 | def score(self, X, y): 200 | return r2_score(y.values.flatten(), self.predict(X)) 201 | 202 | @staticmethod 203 | def _mse(y): 204 | y = y.values.flatten().astype(float) 205 | return np.mean((y - np.mean(y))**2) 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import pathlib 3 | import subprocess 4 | import distutils.cmd 5 | 6 | here = pathlib.Path(__file__).parent.resolve() 7 | 8 | version_file = here / 'VERSION' 9 | 10 | # Get the long description from the README file 11 | long_description = (here / 'README.md').read_text(encoding='utf-8') 12 | 13 | 14 | EPOCHS: int = 50 15 | BATCH_SIZE: int = 16 16 | REQUIREMENTS = [ 17 | 'numpy~=2.3.4', 18 | 'pandas~=2.3.0', 19 | 
'scikit-learn~=1.8.0', 20 | '2ppy~=0.4.0', 21 | 'kneed~=0.8.1', 22 | 'sympy~=1.11' 23 | ] # Optional 24 | 25 | 26 | def format_git_describe_version(version): 27 | if '-' in version: 28 | splitted = version.split('-') 29 | tag = splitted[0] 30 | index = f"dev{splitted[1]}" 31 | return f"{tag}.{index}" 32 | else: 33 | return version 34 | 35 | 36 | def get_version_from_git(): 37 | try: 38 | process = subprocess.run(["git", "describe"], cwd=str(here), check=True, capture_output=True) 39 | version = process.stdout.decode('utf-8').strip() 40 | version = format_git_describe_version(version) 41 | with version_file.open('w') as f: 42 | f.write(version) 43 | return version 44 | except subprocess.CalledProcessError: 45 | if version_file.exists(): 46 | return version_file.read_text().strip() 47 | else: 48 | return '0.1.0.archeo' 49 | 50 | 51 | version = get_version_from_git() 52 | 53 | 54 | print(f"Detected version {version} from git describe") 55 | 56 | 57 | class GetVersionCommand(distutils.cmd.Command): 58 | """A custom command to get the current project version inferred from git describe.""" 59 | 60 | description = 'gets the project version from git describe' 61 | user_options = [] 62 | 63 | def initialize_options(self): 64 | pass 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | print(version) 71 | 72 | 73 | #class CreateTestPredictors(distutils.cmd.Command): 74 | # description = 'gets the project version from git describe' 75 | # user_options = [] 76 | 77 | # def initialize_options(self): 78 | # pass 79 | 80 | # def finalize_options(self): 81 | # pass 82 | 83 | # def run(self): 84 | # from psyke.utils import get_default_random_seed 85 | # from psyke.utils.dataframe import get_discrete_dataset 86 | # from sklearn.model_selection import train_test_split 87 | # from test import REQUIRED_PREDICTORS, get_dataset, get_model, get_schema 88 | # from test.resources.predictors import get_predictor_path, PATH, create_predictor_name 89 | # import ast 90 | # import pandas as pd 91 | # from tensorflow.keras import Model 92 | # from test import Predictor 93 | 94 | # Read the required predictors to run the tests: 95 | # model | model_options | dataset 96 | # required_predictors = pd.read_csv(REQUIRED_PREDICTORS, sep=';') 97 | 98 | # Create missing predictors. 
99 | # model | model_options | dataset 100 | # for index, row in required_predictors.iterrows(): 101 | # options = ast.literal_eval(row['model_options']) 102 | # file_name = create_predictor_name(row['dataset'], row['model'], options) 103 | # if not get_predictor_path(file_name).is_file(): 104 | # dataset = get_dataset(row['dataset']) 105 | # if row['bins'] > 0: 106 | # schema = get_schema(dataset) # int(row['bins']) 107 | # dataset = get_discrete_dataset(dataset.iloc[:, :-1], schema).join(dataset.iloc[:, -1]) 108 | # model, _ = get_model(row['model'], options) 109 | # training_set, test_set = train_test_split(dataset, test_size=0.5, 110 | # random_state=get_default_random_seed()) 111 | # if isinstance(model, Model): 112 | # keys = set(training_set.iloc[:, -1]) 113 | # mapping = {key: i for i, key in enumerate(keys)} 114 | # training_set.iloc[:, -1] = training_set.iloc[:, -1].apply(lambda x: mapping[x]) 115 | # test_set.iloc[:, -1] = test_set.iloc[:, -1].apply(lambda x: mapping[x]) 116 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1], epochs=EPOCHS, batch_size=BATCH_SIZE) 117 | # else: 118 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 119 | # predictor = Predictor(model) 120 | # predictor.save_to_onnx(PATH / file_name, Predictor.get_initial_types(training_set.iloc[:, :-1])) 121 | 122 | # required_predictors.to_csv(REQUIRED_PREDICTORS, sep=';', index=False) 123 | 124 | # print("Done") 125 | 126 | 127 | class CreateTheoryPlot(distutils.cmd.Command): 128 | description = 'create a plot representing samples X and their class/regression value Y predicted by a theory' 129 | user_options = [('theory=', 't', 'textual file of a Prolog theory'), 130 | ('dataset=', 'd', 'file of a dataset'), 131 | ('azimuth=', 'a', 'azimuth of the plot'), 132 | ('distance=', 'D', 'distance from the plot'), 133 | ('elevation=', 'e', 'elevation of the plot'), 134 | ('output=', 'o', 'output file name of the plot'), 135 | ('show=', 's', 'show theory in the plot ([y]/n)'), 136 | ] 137 | default_output_file_name = 'dummy/plot' 138 | default_theory_name = 'dummy/iris-theory' 139 | default_dataset_name = 'dummy/iris' 140 | default_azimuth = '45' 141 | default_distance = '9' 142 | default_elevation = '5' 143 | csv_format = '.csv' 144 | txt_format = '.txt' 145 | pdf_format = '.pdf' 146 | 147 | def initialize_options(self): 148 | self.output = self.default_output_file_name 149 | self.theory = self.default_theory_name 150 | self.dataset = self.default_dataset_name 151 | self.azimuth = self.default_azimuth 152 | self.elevation = self.default_elevation 153 | self.distance = self.default_distance 154 | self.show = True 155 | 156 | def finalize_options(self): 157 | self.theory_file = str(self.theory) 158 | self.data = str(self.dataset) 159 | self.output = str(self.output) 160 | self.a = float(self.azimuth) 161 | self.e = float(self.elevation) 162 | self.d = float(self.distance) 163 | self.s = self.show in (True, 'y', 'Y', 'yes', 'YES', 'Yes') 164 | 165 | def run(self): 166 | import pandas as pd 167 | from tuprolog.theory.parsing import parse_theory 168 | from psyke.utils.plot import plot_theory 169 | 170 | if self.theory_file is None or self.theory_file == '': 171 | raise Exception('Empty theory file name') 172 | if self.data is None or self.data == '': 173 | raise Exception('Empty dataset file name') 174 | with open(self.theory_file + (self.txt_format if '.' 
not in self.theory_file else ''), 'r') as file: 175 | textual_theory = file.read() 176 | theory = parse_theory(textual_theory) 177 | data = pd.read_csv(self.data + (self.csv_format if '.' not in self.data else '')) 178 | plot_theory(theory, data, self.output + self.pdf_format, self.a, self.d, self.e, self.s) 179 | 180 | 181 | setup( 182 | name='psyke', # Required 183 | version=version, 184 | description='Python-based implementation of PSyKE, i.e. a Platform for Symbolic Knowledge Extraction', 185 | license='Apache 2.0 License', 186 | long_description=long_description, 187 | long_description_content_type='text/markdown', 188 | url='https://github.com/psykei/psyke-python', 189 | author='Matteo Magnini', 190 | author_email='matteo.magnini@unibo.it', 191 | classifiers=[ 192 | 'Development Status :: 3 - Alpha', 193 | 'Intended Audience :: Developers', 194 | 'Topic :: Software Development :: Libraries', 195 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 196 | 'License :: OSI Approved :: Apache Software License', 197 | 'Programming Language :: Python :: 3', 198 | 'Programming Language :: Python :: 3.11', 199 | 'Programming Language :: Python :: 3 :: Only', 200 | 'Programming Language :: Prolog' 201 | ], 202 | keywords='knowledge extraction, symbolic ai, ske, extractor, rules, prolog', # Optional 203 | # package_dir={'': 'src'}, # Optional 204 | packages=find_packages('.'), # Required 205 | include_package_data=True, 206 | python_requires='==3.11', 207 | install_requires=REQUIREMENTS, # Optional 208 | zip_safe=False, 209 | platforms="Independant", 210 | project_urls={ # Optional 211 | 'Bug Reports': 'https://github.com/psykei/psyke-python/issues', 212 | # 'Funding': 'https://donate.pypi.org', 213 | # 'Say Thanks!': 'http://saythanks.io/to/example', 214 | 'Source': 'https://github.com/psykei/psyke-python', 215 | }, 216 | cmdclass={ 217 | 'get_project_version': GetVersionCommand, 218 | # 'create_test_predictors': CreateTestPredictors, 219 | 'create_theory_plot': CreateTheoryPlot 220 | }, 221 | ) 222 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import ClassifierMixin 6 | from tuprolog.theory import Theory 7 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor 8 | from psyke.extraction.hypercubic.hypercube import GenericCube 9 | from psyke.extraction.hypercubic.utils import MinUpdate, Expansion 10 | from psyke.utils import get_default_random_seed, Target 11 | 12 | 13 | class ITER(HyperCubeExtractor): 14 | """ 15 | Explanator implementing ITER algorithm, doi:10.1007/11823728_26. 
16 | """ 17 | 18 | def __init__(self, predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps, 19 | ignore_dimensions: Iterable, normalization, output: Target = Target.CONSTANT, 20 | seed=get_default_random_seed()): 21 | super().__init__(predictor, output, normalization=normalization) 22 | if output is Target.REGRESSION: 23 | raise NotImplementedError 24 | self.predictor = predictor 25 | self.min_update = min_update 26 | self._init_points = n_points 27 | self.n_points = n_points 28 | self.max_iterations = max_iterations 29 | self.min_examples = min_examples 30 | self.threshold = threshold 31 | self.fill_gaps = fill_gaps 32 | self._output = Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else \ 33 | output if output is not None else Target.CONSTANT 34 | self.seed = seed 35 | self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else [] 36 | 37 | def make_fair(self, features: Iterable[str]): 38 | self.n_points = self._init_points 39 | self.ignore_dimensions += list(features) 40 | 41 | def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None: 42 | expansions = [] 43 | for limit in cubes: 44 | count = limit.cube.count(dataframe) 45 | dataframe = pd.concat([dataframe, limit.cube.create_samples(self.min_examples - count)]) 46 | limit.cube.update(dataframe, self.predictor) 47 | expansions.append(Expansion( 48 | limit.cube, limit.feature, limit.direction, 49 | abs(cube.output - limit.cube.output) if self._output is Target.CONSTANT else 50 | 1 - int(cube.output == limit.cube.output) 51 | )) 52 | if len(expansions) > 0: 53 | return sorted(expansions, key=lambda e: e.distance)[0] 54 | return None 55 | 56 | def _calculate_min_updates(self) -> Iterable[MinUpdate]: 57 | return [MinUpdate(name, (interval[1] - interval[0]) * self.min_update) for (name, interval) in 58 | self._surrounding.dimensions.items()] 59 | 60 | def _create_range(self, cube: GenericCube, min_updates: Iterable[MinUpdate], feature: str, direction: str)\ 61 | -> tuple[GenericCube, tuple[float, float]]: 62 | a, b = cube[feature] 63 | size = [min_update for min_update in min_updates if min_update.name == feature][0].value 64 | return (cube.copy(), (max(a - size, self._surrounding.get_first(feature)), a) 65 | if direction == '-' else (b, min(b + size, self._surrounding.get_second(feature)))) 66 | 67 | def _create_temp_cube(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 68 | hypercubes: Iterable[GenericCube], feature: str, 69 | direction: str) -> Iterable[Expansion]: 70 | temp_cube, values = self._create_range(cube, min_updates, feature, direction) 71 | temp_cube.update_dimension(feature, values) 72 | overlap = temp_cube.overlap(hypercubes) 73 | while (overlap is not None) & (temp_cube.has_volume()): 74 | overlap = ITER._resolve_overlap(temp_cube, overlap, hypercubes, feature, direction) 75 | if (temp_cube.has_volume() & (overlap is None)) & (all(temp_cube != cube for cube in hypercubes)): 76 | yield Expansion(temp_cube, feature, direction) 77 | else: 78 | cube.add_limit(feature, direction) 79 | 80 | def _create_temp_cubes(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 81 | hypercubes: Iterable[GenericCube]) -> Iterable[Expansion]: 82 | tmp_cubes = [] 83 | for feature in self._surrounding.dimensions.keys(): 84 | if feature in self.ignore_dimensions: 85 | continue 86 | limit = cube.check_limits(feature) 87 | if limit == '*': 88 | continue 89 | for x in {'-', '+'} - {limit}: 90 | tmp_cubes += 
self._create_temp_cube(cube, min_updates, hypercubes, feature, x) 91 | return tmp_cubes 92 | 93 | def _cubes_to_update(self, dataframe: pd.DataFrame, to_expand: Iterable[GenericCube], 94 | hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate]) \ 95 | -> Iterable[tuple[GenericCube, Expansion]]: 96 | results = [(hypercube, self._best_cube(dataframe, hypercube, self._create_temp_cubes( 97 | hypercube, min_updates, hypercubes))) for hypercube in to_expand] 98 | return sorted([result for result in results if result[1] is not None], key=lambda x: x[1].distance) 99 | 100 | def _expand_or_create(self, cube: GenericCube, expansion: Expansion, hypercubes: Iterable[GenericCube]) -> None: 101 | if expansion.distance > self.threshold: 102 | hypercubes += [expansion.cube] 103 | else: 104 | cube.expand(expansion, hypercubes) 105 | 106 | @staticmethod 107 | def _find_closer_sample(dataframe: pd.DataFrame, output: float | str) -> dict[str, float]: 108 | if isinstance(output, str): 109 | close_sample = dataframe[dataframe.iloc[:, -1] == output].iloc[0].to_dict() 110 | else: 111 | difference = abs(dataframe.iloc[:, -1] - output) 112 | close_sample = dataframe[difference == min(difference)].iloc[0].to_dict() 113 | return close_sample 114 | 115 | def _generate_starting_points(self, dataframe: pd.DataFrame) -> Iterable[GenericCube]: 116 | if self.n_points <= 0: 117 | raise (Exception('InvalidAttributeValueException')) 118 | points: Iterable[float] 119 | if isinstance(dataframe.iloc[0, -1], str): 120 | classes = np.unique(dataframe.iloc[:, -1].values) 121 | points = [classes[i] for i in range(min(self.n_points, len(classes)))] 122 | else: 123 | desc = dataframe.iloc[:, -1].describe() 124 | min_output, max_output = desc["min"], desc["max"] 125 | points = [(max_output - min_output) / 2] if self.n_points == 1 else \ 126 | [min_output + (max_output - min_output) / (self.n_points - 1) * index for index in range(self.n_points)] 127 | return [HyperCube.cube_from_point(ITER._find_closer_sample(dataframe, point), output=self._output) 128 | for point in points] 129 | 130 | def _initialize(self, dataframe: pd.DataFrame) -> Iterable[MinUpdate]: 131 | self._fake_dataframe = dataframe.copy() 132 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 133 | min_updates = self._calculate_min_updates() 134 | self._init_hypercubes(dataframe, min_updates) 135 | for hypercube in self._hypercubes: 136 | hypercube.update(dataframe, self.predictor) 137 | return min_updates 138 | 139 | def _init_hypercubes(self, dataframe: pd.DataFrame, min_updates: Iterable[MinUpdate]): 140 | while True: 141 | hypercubes = self._generate_starting_points(dataframe) 142 | for hypercube in hypercubes: 143 | hypercube.expand_all(min_updates, self._surrounding) 144 | for d in self.ignore_dimensions: 145 | hypercube[d] = self._surrounding[d] 146 | self.n_points = self.n_points - 1 147 | if not HyperCube.check_overlap(hypercubes, hypercubes): 148 | break 149 | self._hypercubes = hypercubes 150 | 151 | def _iterate(self, dataframe: pd.DataFrame, hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate], 152 | left_iteration: int) -> int: 153 | np.random.seed(self.seed) 154 | iterations = 0 155 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 156 | while (len(to_expand) > 0) and (iterations < left_iteration): 157 | updates = list(self._cubes_to_update(dataframe, to_expand, hypercubes, min_updates)) 158 | if len(updates) > 0: 159 | 
self._expand_or_create(updates[0][0], updates[0][1], hypercubes) 160 | iterations += 1 161 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 162 | return iterations 163 | 164 | @staticmethod 165 | def _resolve_overlap(cube: GenericCube, overlapping_cube: GenericCube, hypercubes: Iterable[GenericCube], 166 | feature: str, direction: str) -> GenericCube: 167 | a, b = cube[feature] 168 | cube.update_dimension(feature, max(overlapping_cube.get_second(feature), a) if direction == '-' else a, 169 | min(overlapping_cube.get_first(feature), b) if direction == '+' else b) 170 | return cube.overlap(hypercubes) 171 | 172 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 173 | min_updates = self._initialize(dataframe) 174 | temp_train = dataframe.copy() 175 | fake = dataframe.copy() 176 | iterations = 0 177 | while temp_train.shape[0] > 0: 178 | iterations += self._iterate(fake, self._hypercubes, min_updates, self.max_iterations - iterations) 179 | if (iterations >= self.max_iterations) or (not self.fill_gaps): 180 | break 181 | temp_train = temp_train.iloc[[p is None for p in self.predict(temp_train.iloc[:, :-1])]] 182 | if temp_train.shape[0] > 0: 183 | point, ratio, overlap, new_cube = temp_train.iloc[0].to_dict(), 1.0, True, None 184 | temp_train = temp_train.drop([temp_train.index[0]]) 185 | while overlap is not None: 186 | if new_cube is not None: 187 | if not new_cube.has_volume(): 188 | break 189 | new_cube = HyperCube.cube_from_point(point, self._output) 190 | new_cube.expand_all(min_updates, self._surrounding, ratio) 191 | overlap = new_cube.overlap(self._hypercubes) 192 | ratio *= 2 193 | if new_cube.has_volume(): 194 | self._hypercubes += [new_cube] 195 | return self._create_theory(dataframe) 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021, Matteo Magnini and contributors listed in 190 | the CONTRIBUTORS file. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------