├── .github ├── scripts │ └── retry.sh └── workflows │ ├── check.yml │ ├── deploy.yml │ ├── dockerify.yml │ └── todos.yml ├── .gitignore ├── .gitmodules ├── .img ├── logo-wide.png ├── logo-wide.svg ├── logo.png └── logo.svg ├── .python-version ├── CITATION.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── psyke ├── __init__.py ├── clustering │ ├── __init__.py │ ├── cream │ │ └── __init__.py │ ├── exact │ │ └── __init__.py │ └── utils.py ├── extraction │ ├── __init__.py │ ├── cart │ │ ├── CartPredictor.py │ │ ├── FairTree.py │ │ ├── FairTreePredictor.py │ │ └── __init__.py │ ├── hypercubic │ │ ├── __init__.py │ │ ├── cosmik │ │ │ └── __init__.py │ │ ├── creepy │ │ │ └── __init__.py │ │ ├── divine │ │ │ └── __init__.py │ │ ├── gridex │ │ │ └── __init__.py │ │ ├── gridrex │ │ │ └── __init__.py │ │ ├── hex │ │ │ └── __init__.py │ │ ├── hypercube.py │ │ ├── iter │ │ │ └── __init__.py │ │ ├── strategy.py │ │ └── utils.py │ ├── real │ │ ├── __init__.py │ │ └── utils.py │ └── trepan │ │ ├── __init__.py │ │ └── utils.py ├── hypercubepredictor.py ├── schema │ └── __init__.py ├── tuning │ ├── __init__.py │ ├── crash │ │ └── __init__.py │ ├── orchid │ │ └── __init__.py │ └── pedro │ │ └── __init__.py └── utils │ ├── __init__.py │ ├── dataframe.py │ ├── logic.py │ ├── metrics.py │ ├── plot.py │ └── sorted.py ├── pyproject.toml ├── renovate.json ├── requirements.txt ├── setup.py └── test ├── __init__.py └── psyke ├── __init__.py ├── clustering └── __init__.py ├── extraction ├── __init__.py ├── cart │ ├── __init__.py │ ├── test_cart.py │ └── test_simplified_cart.py ├── hypercubic │ ├── __init__.py │ ├── gridex │ │ ├── __init__.py │ │ └── test_gridex.py │ ├── iter │ │ ├── __init__.py │ │ └── test_iter.py │ └── test_hypercube.py ├── real │ ├── __init__.py │ ├── test_real.py │ └── test_rule.py └── trepan │ ├── __init__.py │ ├── test_node.py │ ├── test_split.py │ └── test_trepan.py └── utils ├── __init__.py ├── test_prune.py ├── test_simplify.py └── test_simplify_formatter.py /.github/scripts/retry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DT=${2:-${RETRY_TIME:-5m}} 4 | MAX=${3:-${MAX_RETRIES:-3}} 5 | 6 | for N in `seq 1 $MAX`; do 7 | echo "Attempt $N/$MAX: $1" 8 | eval $1; 9 | RESULT=$? 10 | if [[ $RESULT -eq 0 ]]; then 11 | exit 0 12 | fi 13 | if [[ $N -lt $MAX ]]; then 14 | echo "Failed attempt $N/$MAX. Waiting $DT" 15 | sleep $DT 16 | else 17 | echo "Failed attempt $N/$MAX." 
18 | exit $RESULT 19 | fi 20 | done 21 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: check 2 | on: 3 | push: 4 | tags: '*' 5 | branches-ignore: 6 | - 'autodelivery**' 7 | - 'bump-**' 8 | - 'dependabot/**' 9 | paths-ignore: 10 | - 'CHANGELOG.md' 11 | - 'renovate.json' 12 | - '.gitignore' 13 | pull_request: 14 | workflow_dispatch: 15 | env: 16 | PROJECT_NAME: psyke-python 17 | WORKFLOW: check 18 | TEST_SUBMODULE: psykei/psyke-pytest 19 | jobs: 20 | create-test-predictors-if-needed: 21 | runs-on: ubuntu-latest 22 | name: Create test predictors if needed 23 | # TODO: short circuit job as soon as it's possible: 24 | # https://github.com/actions/runner/issues/662 25 | # if: ${{ github.repository == 'psykei/psyke-python' }} 26 | steps: 27 | - name: Checkout code 28 | if: ${{ github.repository == 'psykei/psyke-python' }} 29 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 30 | with: 31 | fetch-depth: 0 32 | submodules: recursive 33 | 34 | - name: Get Python Version 35 | if: ${{ github.repository == 'psykei/psyke-python' }} 36 | id: get-python-version 37 | run: echo ::set-output name=version::$(cat .python-version) 38 | 39 | - name: Setup Python 40 | if: ${{ github.repository == 'psykei/psyke-python' }} 41 | uses: actions/setup-python@v5 42 | with: 43 | python-version: ${{ steps.get-python-version.outputs.version }} 44 | 45 | - name: Restore Python dependencies 46 | if: ${{ github.repository == 'psykei/psyke-python' }} 47 | run: pip install -r requirements.txt 48 | 49 | # - name: Create missing predictors 50 | # if: ${{ github.repository == 'psykei/psyke-python' }} 51 | # run: python setup.py create_test_predictors 52 | 53 | - name: Submodule update 54 | if: ${{ github.repository == 'psykei/psyke-python' }} 55 | run: | 56 | pushd test/resources 57 | git config user.email "bot@noreply.github.com" 58 | git config user.name "CI bot" 59 | git remote set-url origin https://x-access-token:${{ secrets.TRIGGER_GITHUB_ACTION }}@github.com/${{ env.TEST_SUBMODULE }} 60 | (git add predictors/*.onnx tests/*.csv datasets/*.csv) || echo 'nothing to add' 61 | (git commit -m 'predictors update from workflows') || echo 'nothing to commit' 62 | (git push) || echo 'nothing to push' 63 | run-unit-tests: 64 | strategy: 65 | fail-fast: false 66 | matrix: 67 | os: 68 | - ubuntu-latest 69 | - windows-latest 70 | # - macos-latest 71 | - macos-13 72 | python-version: 73 | - '3.9.12' 74 | runs-on: ${{ matrix.os }} 75 | name: Run tests on Python ${{ matrix.python-version }}, on ${{ matrix.os }} 76 | timeout-minutes: 45 77 | concurrency: 78 | group: ${{ github.workflow }}-run-unit-tests-${{ matrix.python-version }}-${{ matrix.os }}-${{ github.event.number || github.ref }} 79 | cancel-in-progress: true 80 | needs: 81 | - create-test-predictors-if-needed 82 | steps: 83 | - name: Setup Python 84 | uses: actions/setup-python@v5 85 | with: 86 | python-version: ${{ matrix.python-version }} 87 | 88 | - name: Checkout code 89 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 90 | with: 91 | fetch-depth: 0 92 | submodules: recursive 93 | 94 | - name: Restore Python dependencies 95 | run: pip install -r requirements.txt 96 | 97 | - name: Test 98 | run: python -m unittest discover -s test -t . 
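        # To reproduce this check locally (assumption: Python 3.9 and the test-resources
        # submodule are available), the same commands used by this job can be run directly:
        #   pip install -r requirements.txt
        #   python -m unittest discover -s test -t .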
99 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | on: 3 | workflow_run: 4 | workflows: 5 | - check 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: depoly 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 18 | name: Deploy on PyPI and create release 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 22 | with: 23 | fetch-depth: 0 # all history 24 | submodules: recursive 25 | 26 | - name: Get All Tags 27 | run: git fetch --tags -f 28 | 29 | - name: Get Python Version 30 | id: get-python-version 31 | run: echo ::set-output name=version::$(cat .python-version) 32 | 33 | - name: Setup Python 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ steps.get-python-version.outputs.version }} 37 | 38 | - name: Restore Python dependencies 39 | run: | 40 | pip install -r requirements.txt 41 | 42 | - name: Change default logging level 43 | run: sed -i -e 's/DEBUG/WARN/g' psyke/__init__.py 44 | 45 | - name: Pack 46 | run: python -m build 47 | 48 | - name: Archive Dist Artifacts 49 | if: failure() || success() 50 | uses: actions/upload-artifact@v4 51 | with: 52 | name: dist 53 | path: './dist' 54 | 55 | - name: Upload 56 | run: python -m twine upload dist/* 57 | env: 58 | TWINE_USERNAME: ${{ secrets.PYPI_USERANAME }} 59 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 60 | 61 | - name: Get Version 62 | id: get-version 63 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 64 | 65 | - name: Release Assets 66 | id: upload-release-assets 67 | run: | 68 | set -x 69 | ASSETS=() 70 | for A in dist/*; do 71 | ASSETS+=("-a" "$A") 72 | echo "Releasing $A" 73 | done 74 | RELEASE_TAG='${{ steps.get-version.outputs.version }}' 75 | gh release create "$RELEASE_TAG" 76 | env: 77 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 78 | -------------------------------------------------------------------------------- /.github/workflows/dockerify.yml: -------------------------------------------------------------------------------- 1 | name: dockerify 2 | on: 3 | workflow_run: 4 | workflows: 5 | - deploy 6 | types: 7 | - completed 8 | branches: 9 | - master 10 | - develop 11 | env: 12 | PROJECT_NAME: psyke-python 13 | WORKFLOW: dockerify 14 | RETRY_TIME: 5m 15 | MAX_RETRIES: 3 16 | jobs: 17 | dockerify: 18 | runs-on: ubuntu-latest 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | name: Dockerify with Jupyter support 21 | steps: 22 | - name: Docker Login 23 | run: docker login -u ${{ secrets.DOCKERHUB_USERANAME }} -p ${{ secrets.DOCKERHUB_PASSWORD }} 24 | 25 | - name: Checkout code 26 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 27 | with: 28 | fetch-depth: 0 # all history 29 | submodules: recursive 30 | 31 | - name: Get All Tags 32 | run: git fetch --tags -f 33 | 34 | - name: Get Version 35 | id: get-version 36 | run: echo ::set-output name=version::$(python setup.py get_project_version | tail -n 1) 37 | 38 | - name: Create Docker Image 39 | run: ./.github/scripts/retry.sh "docker build -t pikalab/psyke:$PSYKE_VERSION --build-arg PSYKE_VERSION=$PSYKE_VERSION ." 
40 | shell: bash 41 | env: 42 | PSYKE_VERSION: '${{ steps.get-version.outputs.version }}' 43 | 44 | - name: Push Image on Docker Hub 45 | run: docker push pikalab/psyke:${{ steps.get-version.outputs.version }} 46 | -------------------------------------------------------------------------------- /.github/workflows/todos.yml: -------------------------------------------------------------------------------- 1 | name: "TODOs finder" 2 | on: 3 | push: 4 | branches-ignore: 5 | - 'autodelivery**' 6 | - 'bump-**' 7 | - 'renovate/**' 8 | - 'dependabot/**' 9 | jobs: 10 | build: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - uses: "actions/checkout@master" 14 | - name: "TODO to Issue" 15 | uses: "alstr/todo-to-issue-action@v5.1.12" 16 | id: "todo" 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | VERSION 4 | 5 | .idea/ 6 | .vscode/ 7 | 8 | *~ 9 | *.jar 10 | 11 | ### Python ### 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | pytestdebug.log 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | doc/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | pythonenv* 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ 144 | 145 | # pytype static type analyzer 146 | .pytype/ 147 | 148 | # profiling data 149 | .prof 150 | 151 | # End of https://www.toptal.com/developers/gitignore/api/python 152 | 153 | # macOS stuff 154 | .DS_store 155 | 156 | # File ONNX 157 | *.onnx 158 | 159 | # Local stuff 160 | dummy/ 161 | tmp_model/ 162 | plots/ 163 | demo/ 164 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test/resources"] 2 | path = test/resources 3 | url = https://github.com/psykei/psyke-pytest.git 4 | -------------------------------------------------------------------------------- /.img/logo-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/1e4ce34679d34abde6f7545a070aab99d3b053bb/.img/logo-wide.png -------------------------------------------------------------------------------- /.img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/1e4ce34679d34abde6f7545a070aab99d3b053bb/.img/logo.png -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.23 2 | -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | To cite PSyKE in publications, please use: 2 | 3 | > Federico Sabbatini, Giovanni Ciatto, Roberta Calegari, Andrea Omicini. "[On the Design of PSyKE: A Platform for Symbolic Knowledge Extraction](http://ceur-ws.org/Vol-2963/paper14.pdf)", in: WOA 2021 – 22nd Workshop “From Objects to Agents”, Aachen, Sun SITE Central Europe, RWTH Aachen University, 2021, 2963, pp. 29 - 48. 4 | 5 | A BibTeX entry for LaTeX users is: 6 | ```bibtex 7 | @inproceedings{psyke-woa2021, 8 | articleno = 3, 9 | author = {Sabbatini, Federico and Ciatto, Giovanni and Calegari, Roberta and Omicini, Andrea}, 10 | booktitle = {WOA 2021 -- 22nd Workshop ``From Objects to Agents''}, 11 | editor = {Calegari, Roberta and Ciatto, Giovanni and Denti, Enrico and Omicini, Andrea and Sartor, Giovanni}, 12 | issn = {1613-0073}, 13 | keywords = {explainable AI, knowledge extraction, interpretable prediction, PSyKE}, 14 | location = {Bologna, Italy}, 15 | month = oct, 16 | note = {22nd Workshop ``From Objects to Agents'' (WOA 2021), Bologna, Italy, 1--3~} # sep # {~2021. 
Proceedings}, 17 | numpages = 20, 18 | pages = {29--48}, 19 | publisher = {Sun SITE Central Europe, RWTH Aachen University}, 20 | series = {CEUR Workshop Proceedings}, 21 | subseries = {AI*IA Series}, 22 | title = {On the Design of {PSyKE}: A Platform for Symbolic Knowledge Extraction}, 23 | url = {http://ceur-ws.org/Vol-2963/paper14.pdf}, 24 | volume = 2963, 25 | year = 2021 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | ARG PSYKE_VERSION 3 | EXPOSE 8888 4 | RUN apt update; apt install -y -q openjdk-17-jdk 5 | RUN pip install jupyter 6 | RUN pip install psyke==$PSYKE_VERSION 7 | RUN mkdir -p /root/.jupyter 8 | ENV JUPYTER_CONF_FILE /root/.jupyter/jupyter_notebook_config.py 9 | RUN echo "c.NotebookApp.allow_origin = '*'" > $JUPYTER_CONF_FILE 10 | RUN echo "c.NotebookApp.ip = '0.0.0.0'" >> $JUPYTER_CONF_FILE 11 | RUN mkdir -p /notebook 12 | COPY test/resources/datasets/*.csv /notebook/datasets/ 13 | WORKDIR /notebook 14 | CMD jupyter notebook --allow-root --no-browser 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 PIKA-lab / eXplanable AI 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include VERSION
2 | exclude test/*
3 | exclude demo/*
4 | exclude main.py
5 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PSyKE
2 | 
3 | ![PSyKE Logo](.img/logo-wide.png)
4 | 
5 | Some quick links:
6 | * [Home Page](https://apice.unibo.it/xwiki/bin/view/PSyKE/)
7 | * [GitHub Repository](https://github.com/psykei/psyke-python)
8 | * [PyPi Repository](https://pypi.org/project/psyke/)
9 | * [Issues](https://github.com/psykei/psyke-python/issues)
10 | 
11 | ## Intro
12 | 
13 | [PSyKE](https://apice.unibo.it/xwiki/bin/view/PSyKE/) (Platform for Symbolic Knowledge Extraction)
14 | is intended as a library for extracting symbolic knowledge (in the form of logic rules) out of sub-symbolic predictors.
15 | 
16 | More precisely, PSyKE offers a general-purpose API for knowledge extraction, and a number of different algorithms implementing it,
17 | supporting both classification and regression problems.
18 | The extracted knowledge consists of a Prolog theory (i.e., a list of Horn clauses) or an OWL ontology containing SWRL rules.
19 | 
20 | PSyKE relies on [2ppy](https://github.com/tuProlog/2ppy) (tuProlog in Python) for logic support, which in turn is based on the [2p-Kt](https://github.com/tuProlog/2p-kt) logic ecosystem.
21 | 
22 | ### Class diagram overview:
23 | 
24 | ![PSyKE class diagram](http://www.plantuml.com/plantuml/svg/PLBBRkem4DtdAqQixeLcqsN40aHfLQch2dM341gS0IpoY3oJYfJctnl7RkgcKZRdCUFZ4ozOq4YTPr65we8dWlkgQcuHmEPCfMbW6iDaEe5LXZLJr4QHof3PgxVMGoTtS5XJSNCXkwVxlhdUguzQeUYoi28u3bxNovS0RWnLM7H46mNZXaw6c4UZpq8cW4z6ftGTZoeq4WwjB6x7BbPdoZ7qFMXMXeGU2QKsv2I06HmTiIymfmHOpA1WccjcVSXe_uvPJPn0gfLiEyyTl5bcrtk7qzTNCQYaDBxhyQ6_BFFFEExJ_sLzXoFMLpdcVMrZrhVNvS83zygFmrv-1fMXL5lOezH5rH_z7qqWqonRbn-72-nwAxaz_r8KP9B_YNz3uTP0jFcmAt6xB9gT3UJSC8_Z87G2PIrLBL0UemKLQPrdNm00)
25 | 
26 | 
29 | 
30 | PSyKE is designed around the notion of _extractor_.
31 | More precisely, an `Extractor` is any object capable of extracting a logic `Theory` out of a trained sub-symbolic regressor or classifier.
32 | Accordingly, an `Extractor` is composed of
33 | _(i)_ a trained predictor (i.e., a black box used as an oracle) and
34 | _(ii)_ a set of feature descriptors, and it provides two methods:
35 | * `extract`: returns a logic theory given a dataset;
36 | * `predict`: predicts a value using the extracted rules (instead of the original predictor).
37 | 
38 | Currently, the supported extraction algorithms are:
39 | * [CART](https://doi.org/10.1201/9781315139470),
40 | straightforwardly extracts rules from both classification and regression decision trees;
41 | * Classification:
42 |   * [REAL](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1) (Rule Extraction As Learning),
43 |   generates and generalizes rules starting from dataset samples;
44 |   * [Trepan](http://dx.doi.org/10.1016/B978-1-55860-335-6.50013-1),
45 |   generates rules by inducing a decision tree and possibly exploiting m-of-n expressions;
46 | * Regression:
47 |   * [ITER](http://dx.doi.org/10.1007/11823728_26),
48 |   builds and iteratively expands hypercubes in the input space.
49 |   Each cube holds a constant value, that is the estimated output for the samples inside the cube;
50 |   * [GridEx](http://dx.doi.org/10.1007/978-3-030-82017-6_2),
51 |   extension of the ITER algorithm that produces shorter rule lists retaining higher fidelity w.r.t. the predictor.
52 |   * GridREx,
53 |   extension of GridEx where the output of each hypercube is a linear combination of the input variables and not a constant value.
54 | 
55 | Users may exploit the PEDRO algorithm, included in PSyKE, to tune the optimal values for GridEx and GridREx hyper-parameters.
56 | 
57 | We are working on PSyKE to extend its features to encompass explainable clustering tasks, as well as to make the supported extraction algorithms more general-purpose (e.g., by adding classification support to GridEx and GridREx).
58 | 
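For a concrete feel of this workflow, here is a minimal sketch built around the `Cart` extractor, whose constructor appears in `psyke/extraction/cart/__init__.py`. It is illustrative only: variable names are placeholders, and the exact signatures should be double-checked against the installed release.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from psyke.extraction.cart import Cart  # import path mirrors this repository's layout

# Any fitted model exposing a `predict` method can act as the oracle.
data = load_iris(as_frame=True).frame  # features plus the target as the last column
predictor = KNeighborsClassifier().fit(data.iloc[:, :-1], data.iloc[:, -1])

# Wrap the black box with an extractor and distil a logic theory from the data.
extractor = Cart(predictor, max_depth=3)
theory = extractor.extract(data)
print(theory)
```

Any other extractor offering the same `extract`/`predict` interface (e.g. REAL, Trepan, GridEx) can be plugged in the same way.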
59 | ## Users
60 | 
61 | ### End users
62 | 
63 | PSyKE is deployed as a library on PyPI, and it can therefore be installed as a Python package by running:
64 | ```bash
65 | pip install psyke
66 | ```
67 | 
68 | #### Requirements
69 | * `numpy`
70 | * `pandas`
71 | * `scikit-learn`
72 | * `2ppy`
73 | 
74 | ##### Test requirements
75 | * `skl2onnx`
76 | * `onnxruntime`
77 | * `parameterized`
78 | 
79 | Once installed, it is possible to create an extractor from a predictor
80 | (e.g. Neural Network, Support Vector Machine, K-Nearest Neighbor, Random Forest, etc.)
81 | and from the dataset used to train the predictor.
82 | 
83 | > **Note:** the predictor must expose a method named `predict` to be properly used as an oracle.
84 | 
85 | #### Demo
86 | 
87 | A brief example is presented in the `demo.py` script in the `demo/` folder.
88 | Using `sklearn`'s Iris dataset, we train a K-Nearest Neighbor classifier to predict the correct output class.
89 | Before training, we discretize the dataset.
90 | After that, we create two different extractors: REAL and Trepan.
91 | We output the extracted theory for both extractors.
92 | 
93 | REAL extracted rules:
94 | ```
95 | iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :- PetalWidth =< 1.0.
96 | iris(PetalLength1, PetalWidth1, SepalLength1, SepalWidth1, versicolor) :- PetalLength1 > 4.9, SepalWidth1 in [2.9, 3.2].
97 | iris(PetalLength2, PetalWidth2, SepalLength2, SepalWidth2, versicolor) :- PetalWidth2 > 1.6.
98 | iris(PetalLength3, PetalWidth3, SepalLength3, SepalWidth3, virginica) :- SepalWidth3 =< 2.9.
99 | iris(PetalLength4, PetalWidth4, SepalLength4, SepalWidth4, virginica) :- SepalLength4 in [5.4, 6.3].
100 | iris(PetalLength5, PetalWidth5, SepalLength5, SepalWidth5, virginica) :- PetalWidth5 in [1.0, 1.6].
101 | ```
102 | 
103 | Trepan extracted rules:
104 | ```
105 | iris(PetalLength6, PetalWidth6, SepalLength6, SepalWidth6, virginica) :- PetalLength6 > 3.0, PetalLength6 in [3.0, 4.9].
106 | iris(PetalLength7, PetalWidth7, SepalLength7, SepalWidth7, versicolor) :- PetalLength7 > 3.0.
107 | iris(PetalLength8, PetalWidth8, SepalLength8, SepalWidth8, setosa) :- true.
108 | ```
109 | 
110 | 
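Since extractors also expose `predict`, the fidelity of the extracted rules with respect to the underlying predictor can be measured directly. The snippet below is only a sketch: it assumes `predictor` and `extractor` were built as in `demo.py`, and that `test` is a held-out, discretized slice of the dataset with the class as its last column.

```python
from sklearn.metrics import accuracy_score

X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]   # placeholder held-out data
rule_predictions = extractor.predict(X_test)            # answers produced by the extracted rules
oracle_predictions = predictor.predict(X_test)          # answers produced by the black box

print('fidelity:', accuracy_score(oracle_predictions, rule_predictions))  # rules vs. oracle
print('accuracy:', accuracy_score(y_test, rule_predictions))              # rules vs. ground truth
```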
111 | ## Developers
112 | 
113 | Working with the PSyKE codebase requires a number of tools to be installed:
114 | * Python 3.9
115 |   + Python versions greater than `3.9.x` are currently __not__ supported
116 | 
117 | * JDK 11+ (please ensure the `JAVA_HOME` environment variable is properly configured)
118 | * Git 2.20+
119 | 
120 | ### Develop PSyKE with PyCharm
121 | 
122 | To participate in the development of PSyKE, we suggest the [PyCharm](https://www.jetbrains.com/pycharm/) IDE.
123 | 
124 | #### Importing the project
125 | 
126 | 1. Clone this repository in a folder of your preference using `git clone`
127 | 2. Open PyCharm
128 | 3. Select `Open`
129 | 4. Navigate your file system and find the folder where you cloned the repository
130 | 5. Click `Open`
131 | 
132 | ### Developing the project
133 | 
134 | Contributions to this project are welcome. Just some rules:
135 | * We use [git flow](https://github.com/nvie/gitflow), so if you write new features, please do so in a separate `feature/` branch
136 | * We recommend forking the project, developing your code, then contributing back via pull request
137 | * Commit often
138 | * Stay in sync with the `develop` (or `master`) branch (pull frequently if the build passes)
139 | * Do not introduce low-quality or untested code
140 | 
141 | #### Issue tracking
142 | If you encounter problems in using or developing PSyKE, you are encouraged to report them through the project's
143 | ["Issues" section](https://github.com/psykei/psyke-python/issues) on GitHub.
--------------------------------------------------------------------------------
/psyke/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from typing import Iterable
3 | 
4 | from psyke import Clustering, Target
5 | from psyke.extraction.hypercubic import HyperCube
6 | from psyke.hypercubepredictor import HyperCubePredictor
7 | 
8 | 
9 | class HyperCubeClustering(HyperCubePredictor, Clustering, ABC):
10 | 
11 |     def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None):
12 |         HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization)
13 | 
14 |     def get_hypercubes(self) -> Iterable[HyperCube]:
15 |         raise NotImplementedError('get_hypercubes')
16 | 
--------------------------------------------------------------------------------
/psyke/clustering/cream/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from typing import Iterable
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | 
8 | from psyke.utils import Target, get_default_random_seed
9 | from psyke.clustering.exact import ExACT
10 | from psyke.extraction.hypercubic import Node, HyperCube, ClosedCube
11 | from psyke.clustering.utils import select_gaussian_mixture
12 | 
13 | 
14 | class CREAM(ExACT):
15 |     """
16 |     Explanator implementing CREAM algorithm.
17 | """ 18 | 19 | def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5, 20 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 21 | super().__init__(depth, error_threshold, output, gauss_components, discretization, normalization, seed) 22 | 23 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 24 | cubes = [] 25 | for i in range(len(np.unique(gauss_pred))): 26 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 27 | if len(df) == 0: 28 | continue 29 | inner_cube = self._create_cube(df, clusters) 30 | indices = self._indices(inner_cube, node.dataframe) 31 | if indices is None: 32 | continue 33 | right, left = self._split(inner_cube, node.cube, node.dataframe, indices) 34 | cubes.append(( 35 | ((right.diversity + left.diversity) / 2, right.volume(), left.volume(), i), 36 | (right, indices), (left, ~indices) 37 | )) 38 | return cubes 39 | 40 | def _split(self, right: ClosedCube, outer_cube: ClosedCube, data: pd.DataFrame, indices: np.ndarray): 41 | right.update(data.iloc[indices], self._predictor) 42 | left = outer_cube.copy() 43 | left.update(data.iloc[~indices], self._predictor) 44 | return right, left 45 | 46 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 47 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 48 | while len(to_split) > 0: 49 | to_split.sort(reverse=True) 50 | (_, depth, _, node) = to_split.pop() 51 | data = ExACT._remove_string_label(node.dataframe) 52 | gauss_params = select_gaussian_mixture(data, self.gauss_components) 53 | gauss_pred = gauss_params[2].predict(data) 54 | cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 55 | if len(cubes) < 1: 56 | continue 57 | _, right, left = min(cubes) 58 | # find_better_constraints(node.dataframe[right[1]], right[0]) 59 | node.right = Node(node.dataframe[right[1]], right[0]) 60 | node.cube.update(node.dataframe[left[1]], self._predictor) 61 | node.left = Node(node.dataframe[left[1]], left[0]) 62 | 63 | if depth < self.depth: 64 | to_split += [ 65 | (error, depth + 1, np.random.uniform(), n) for (n, error) in 66 | zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold 67 | ] 68 | return self._node_to_cubes(surrounding) -------------------------------------------------------------------------------- /psyke/clustering/exact/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | from collections import Counter 5 | from typing import Iterable, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.cluster import DBSCAN 10 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 11 | 12 | from psyke.clustering import HyperCubeClustering 13 | from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube 14 | from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon 15 | from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube 16 | from psyke.utils import Target, get_default_random_seed 17 | 18 | 19 | class ExACT(HyperCubeClustering, ABC): 20 | """ 21 | Explanator implementing ExACT algorithm. 
22 | """ 23 | 24 | def __init__(self, depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, 25 | gauss_components: int = 2, discretization=None, normalization=None, 26 | seed: int = get_default_random_seed()): 27 | super().__init__(output, discretization, normalization) 28 | self.depth = depth 29 | self.error_threshold = error_threshold 30 | self.gauss_components = gauss_components 31 | self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor() 32 | self._predictor.n_neighbors = 1 33 | self.seed = seed 34 | 35 | def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int): 36 | cubes = [] 37 | for i in range(len(np.unique(gauss_pred))): 38 | df = node.dataframe.iloc[np.where(gauss_pred == i)] 39 | if len(df) == 0: 40 | continue 41 | cubes.append(self._create_cube(df, clusters)) 42 | indices = [self._indices(cube, node.dataframe) for cube in cubes] 43 | return cubes, indices 44 | 45 | @staticmethod 46 | def _indices(cube: ClosedCube, data: pd.DataFrame) -> np.ndarray | None: 47 | indices = cube.filter_indices(data.iloc[:, :-1]) 48 | if len(data.iloc[indices]) * len(data.iloc[~indices]) == 0: 49 | return None 50 | return indices 51 | 52 | def _create_cube(self, dataframe: pd.DataFrame, clusters: int) -> ClosedCube: 53 | data = ExACT._remove_string_label(dataframe) 54 | dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1]) 55 | return HyperCube.create_surrounding_cube( 56 | dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])], 57 | True, self._output 58 | ) 59 | 60 | def fit(self, dataframe: pd.DataFrame): 61 | np.random.seed(self.seed) 62 | self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1]) 63 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output) 64 | self._hypercubes = self._iterate(Node(dataframe, self._surrounding)) 65 | 66 | def get_hypercubes(self) -> Iterable[HyperCube]: 67 | return list(self._hypercubes) 68 | 69 | def explain(self): 70 | for cube in self._hypercubes: 71 | print(f'Output is {cube.output} if:') 72 | for feature in cube.dimensions: 73 | lower, upper = cube[feature] 74 | print(f' {feature} is in [{lower:.2f}, {upper:.2f}]') 75 | 76 | @staticmethod 77 | def _remove_string_label(dataframe: pd.DataFrame): 78 | return dataframe.replace({dataframe.columns[-1]: {v: k for k, v in dict( 79 | enumerate(dataframe.iloc[:, -1].unique()) 80 | ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe 81 | 82 | def _iterate(self, surrounding: Node) -> Iterable[HyperCube]: 83 | to_split = [(self.error_threshold * 10, 1, 1, surrounding)] 84 | while len(to_split) > 0: 85 | to_split.sort(reverse=True) 86 | (_, depth, _, node) = to_split.pop() 87 | data = ExACT._remove_string_label(node.dataframe) 88 | gauss_params = select_gaussian_mixture(data, self.gauss_components) 89 | gauss_pred = gauss_params[2].predict(data) 90 | cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1]) 91 | cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices)) 92 | if (idx is not None) and (not node.cube.equal(c))] 93 | if len(cubes) < 1: 94 | continue 95 | _, _, _, indices, cube = max(cubes) 96 | 97 | cube.update(node.dataframe[indices], self._predictor) 98 | node.right = Node(node.dataframe[indices], cube) 99 | node.cube.update(node.dataframe[~indices], self._predictor) 100 | node.left = Node(node.dataframe[~indices], node.cube) 101 | 102 | if 
depth < self.depth and cube.diversity > self.error_threshold: 103 | to_split.append((cube.diversity, depth + 1, np.random.uniform(), node.right)) 104 | return self._node_to_cubes(surrounding) 105 | 106 | def _node_to_cubes(self, root: Node) -> list[ClosedCube]: 107 | if root.right is None: 108 | return [root.cube] 109 | else: 110 | return self._node_to_cubes(root.right) + self._node_to_cubes(root.left) 111 | 112 | def _default_cube(self) -> Union[ClosedCube, ClosedRegressionCube, ClosedClassificationCube]: 113 | if self._output == Target.CONSTANT: 114 | return ClosedCube() 115 | if self._output == Target.REGRESSION: 116 | return ClosedRegressionCube() 117 | return ClosedClassificationCube() 118 | -------------------------------------------------------------------------------- /psyke/clustering/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from kneed import KneeLocator 4 | from sklearn.cluster import DBSCAN 5 | from sklearn.mixture import GaussianMixture 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float, int, GaussianMixture]: 10 | components = range(2, max_components + 1) 11 | try: 12 | models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)] 13 | except ValueError: 14 | print(data) 15 | print(len(data)) 16 | return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)]) 17 | 18 | 19 | def select_dbscan_epsilon(data: pd.DataFrame, clusters: int) -> float: 20 | neighbors = NearestNeighbors(n_neighbors=min(len(data.columns) * 2, len(data))).fit(data) 21 | distances = sorted(np.mean(neighbors.kneighbors(data)[1], axis=1), reverse=True) 22 | try: 23 | kn = KneeLocator([d for d in range(len(distances))], distances, 24 | curve='convex', direction='decreasing', online=True) 25 | if kn.knee is None or kn.knee_y is None: 26 | epsilon = max(distances[-1], 1e-3) 27 | else: 28 | epsilon = kn.knee_y 29 | except (RuntimeWarning, UserWarning, ValueError): 30 | epsilon = max(distances[-1], 1e-3) 31 | k = 1. 
32 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 33 | # while Counter(dbscan_pred).most_common(1)[0][0] == -1: 34 | for i in range(1000): 35 | if len(np.unique(dbscan_pred)) < clusters + 1: 36 | break 37 | k += .1 38 | dbscan_pred = DBSCAN(eps=epsilon * k).fit_predict(data.iloc[:, :-1]) 39 | return epsilon * k 40 | -------------------------------------------------------------------------------- /psyke/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from collections import Iterable 3 | 4 | import pandas as pd 5 | from tuprolog.theory import Theory 6 | 7 | from psyke import Extractor 8 | 9 | 10 | class PedagogicalExtractor(Extractor, ABC): 11 | 12 | def __init__(self, predictor, discretization=None, normalization=None): 13 | Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization) 14 | 15 | def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame: 16 | new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index) 17 | data = dataframe.iloc[:, :-1].copy().join(new_y) 18 | data.columns = dataframe.columns 19 | return data 20 | 21 | def extract(self, dataframe: pd.DataFrame) -> Theory: 22 | self.theory = self._extract(self._substitute_output(dataframe)) 23 | return self.theory 24 | 25 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 26 | raise NotImplementedError('extract') 27 | -------------------------------------------------------------------------------- /psyke/extraction/cart/CartPredictor.py: -------------------------------------------------------------------------------- 1 | from collections import Iterable 2 | from typing import Union, Any 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 6 | from tuprolog.core import clause, Var, Struct 7 | from tuprolog.theory import Theory, mutable_theory 8 | 9 | from psyke.extraction.cart import LeafConstraints, LeafSequence 10 | from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature 11 | from psyke.utils.logic import create_variable_list, create_head, create_term 12 | 13 | 14 | class CartPredictor: 15 | """ 16 | A wrapper for decision and regression trees of sklearn. 
17 | """ 18 | 19 | def __init__(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor] = DecisionTreeClassifier(), 20 | discretization=None, normalization=None): 21 | self._predictor = predictor 22 | self.discretization = discretization 23 | self.normalization = normalization 24 | 25 | def __get_constraints(self, nodes: Iterable[(int, bool)]) -> LeafConstraints: 26 | thresholds = [self._predictor.tree_.threshold[i[0]] for i in nodes] 27 | features = [self._predictor.feature_names_in_[self._predictor.tree_.feature[node[0]]] for node in nodes] 28 | conditions = [node[1] for node in nodes] 29 | if self.normalization is not None: 30 | thresholds = [threshold * self.normalization[feature][1] + self.normalization[feature][0] 31 | for feature, threshold in zip(features, thresholds)] 32 | cond_dict = {} 33 | for feature, condition, threshold in zip(features, conditions, thresholds): 34 | cond = LessThan(threshold) if condition else GreaterThan(threshold) 35 | if feature in cond_dict: 36 | try: 37 | cond_dict[feature][-1] *= cond 38 | except SchemaException: 39 | cond_dict[feature].append(cond) 40 | else: 41 | cond_dict[feature] = [cond] 42 | return cond_dict 43 | 44 | def __get_leaves(self) -> Iterable[int]: 45 | return [i for i, (left_child, right_child) in enumerate(zip( 46 | self._left_children, self._right_children 47 | )) if left_child == -1 and right_child == -1] 48 | 49 | def __get_prediction(self, node: int) -> Any: 50 | if hasattr(self._predictor, 'classes_'): 51 | return self._predictor.classes_[np.argmax(self._predictor.tree_.value[node])] 52 | else: 53 | return self._predictor.tree_.value[node] 54 | 55 | def __path(self, node: int, path=None) -> Iterable[(int, bool)]: 56 | path = [] if path is None else path 57 | if node == 0: 58 | return path 59 | father = list(self._left_children if node in self._left_children else self._right_children).index(node) 60 | return self.__path(father, [(father, node in self._left_children)] + path) 61 | 62 | def __iter__(self) -> LeafSequence: 63 | leaves = self.__get_leaves() 64 | return ((self.__get_constraints(self.__path(i)), self.__get_prediction(i)) for i in leaves) 65 | 66 | def predict(self, data) -> Iterable: 67 | return self._predictor.predict(data) 68 | 69 | @staticmethod 70 | def _simplify_nodes(nodes: list) -> Iterable: 71 | simplified = [nodes.pop(0)] 72 | while len(nodes) > 0: 73 | first_node = nodes[0][0] 74 | for k, conditions in first_node.items(): 75 | for condition in conditions: 76 | if all(k in node[0] and condition in node[0][k] for node in nodes): 77 | [node[0][k].remove(condition) for node in nodes] 78 | simplified.append(nodes.pop(0)) 79 | return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified] 80 | 81 | def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]: 82 | results = [] 83 | for feature_name, cond_list in conditions.items(): 84 | for condition in cond_list: 85 | feature: DiscreteFeature = [d for d in self.discretization if feature_name in d.admissible_values][0] \ 86 | if self.discretization else None 87 | results.append(create_term(variables[feature_name], condition) if feature is None else 88 | create_term(variables[feature.name], 89 | feature.admissible_values[feature_name], 90 | isinstance(condition, GreaterThan))) 91 | return results 92 | 93 | def create_theory(self, data: pd.DataFrame, simplify: True) -> Theory: 94 | new_theory = mutable_theory() 95 | nodes = [node for node in self] 96 | nodes = 
self._simplify_nodes(nodes) if simplify else nodes 97 | for (constraints, prediction) in nodes: 98 | if self.normalization is not None and data.columns[-1] in self.normalization: 99 | m, s = self.normalization[data.columns[-1]] 100 | prediction = prediction * s + m 101 | variables = create_variable_list(self.discretization, data) 102 | new_theory.assertZ( 103 | clause( 104 | create_head(data.columns[-1], list(variables.values()), prediction), 105 | self._create_body(variables, constraints) 106 | ) 107 | ) 108 | return new_theory 109 | 110 | @property 111 | def predictor(self) -> Union[DecisionTreeClassifier, DecisionTreeRegressor]: 112 | return self._predictor 113 | 114 | @property 115 | def n_leaves(self) -> int: 116 | return len(list(self.__get_leaves())) 117 | 118 | @property 119 | def _left_children(self) -> list[int]: 120 | return self._predictor.tree_.children_left 121 | 122 | @property 123 | def _right_children(self) -> list[int]: 124 | return self._predictor.tree_.children_right 125 | 126 | @predictor.setter 127 | def predictor(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor]): 128 | self._predictor = predictor 129 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | 4 | from sklearn.metrics import accuracy_score, r2_score 5 | 6 | 7 | class Node: 8 | def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None): 9 | self.feature = feature 10 | self.threshold = threshold 11 | self.left = left 12 | self.right = right 13 | self.value = value 14 | 15 | def is_leaf_node(self): 16 | return self.value is not None 17 | 18 | 19 | class FairTree: 20 | def __init__(self, max_depth=3, max_leaves=None, criterion=None, min_samples_split=2, lambda_penalty=0.0, 21 | protected_attr=None): 22 | self.max_depth = max_depth 23 | self.max_leaves = max_leaves 24 | self.min_samples_split = min_samples_split 25 | self.lambda_penalty = lambda_penalty 26 | self.protected_attr = protected_attr 27 | self.criterion = criterion 28 | self.root = None 29 | self.n_leaves = 0 30 | self.quality_function = None 31 | 32 | def fit(self, X, y): 33 | self.n_leaves = 0 34 | self.root = self._grow_tree(X, y, depth=0) 35 | while self.n_leaves > self.max_leaves: 36 | self.prune_least_important_leaf(X, y) 37 | self.n_leaves -= 1 38 | return self 39 | 40 | @staticmethod 41 | def _estimate_output(y): 42 | raise NotImplementedError 43 | 44 | def score(self, X, y): 45 | raise NotImplementedError 46 | 47 | def predict(self, X): 48 | return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()]) 49 | 50 | def _traverse_tree(self, x, node): 51 | if node.is_leaf_node(): 52 | return node.value 53 | if x[node.feature] <= node.threshold: 54 | return self._traverse_tree(x, node.left) 55 | return self._traverse_tree(x, node.right) 56 | 57 | def _grow_tree(self, X, y, depth): 58 | if depth >= self.max_depth or X.shape[0] < self.min_samples_split or len(set(y.values.flatten())) == 1 or \ 59 | (self.max_leaves is not None and self.n_leaves >= self.max_leaves): 60 | self.n_leaves += 1 61 | return Node(value=self._estimate_output(y)) 62 | 63 | best_feature, best_threshold = self._best_split(X, y) 64 | if best_feature is None: 65 | self.n_leaves += 1 66 | return Node(value=self._estimate_output(y)) 67 | 68 | left_idxs = X[best_feature] <= best_threshold 69 | right_idxs = 
X[best_feature] > best_threshold 70 | 71 | left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1) 72 | right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1) 73 | return Node(best_feature, best_threshold, left, right) 74 | 75 | @staticmethod 76 | def generate_thresholds(X, y): 77 | sorted_indices = np.argsort(X) 78 | X = np.array(X)[sorted_indices] 79 | y = np.array(y)[sorted_indices] 80 | return np.array([(X[i] + X[i - 1]) / 2.0 for i in range(1, len(X)) if y[i] != y[i - 1]]) 81 | 82 | def _best_split(self, X, y): 83 | best_gain = -float('inf') 84 | split_idx, split_threshold = None, None 85 | 86 | for feature in [feature for feature in X.columns if feature not in self.protected_attr]: 87 | # for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))): 88 | for threshold in self.generate_thresholds(X[feature], y): 89 | left_idxs = X[feature] <= threshold 90 | right_idxs = X[feature] > threshold 91 | 92 | if left_idxs.sum() == 0 or right_idxs.sum() == 0: 93 | continue 94 | 95 | gain = self._fair_gain(y, left_idxs, right_idxs, X[self.protected_attr]) 96 | 97 | if gain > best_gain: 98 | best_gain = gain 99 | split_idx = feature 100 | split_threshold = threshold 101 | return split_idx, split_threshold 102 | 103 | @staticmethod 104 | def _disparity(group): 105 | counts = Counter(group) 106 | if len(counts) <= 1: 107 | return 0.0 108 | values = np.array(list(counts.values())) / len(group) 109 | return np.abs(values[0] - values[1]) 110 | 111 | def _fair_gain(self, y, left_idx, right_idx, protected): 112 | child = len(y[left_idx]) / len(y) * self.quality_function(y[left_idx]) + \ 113 | len(y[right_idx]) / len(y) * self.quality_function(y[right_idx]) 114 | info_gain = self.quality_function(y) - child 115 | penalty = self._disparity(protected[left_idx]) + self._disparity(protected[right_idx]) 116 | return info_gain - self.lambda_penalty * penalty 117 | 118 | @staticmethod 119 | def _match_path(x, path): 120 | for node, left in path: 121 | if left and x[node.feature] > node.threshold: 122 | return False 123 | if not left and x[node.feature] <= node.threshold: 124 | return False 125 | return True 126 | 127 | @staticmethod 128 | def candidates(node, parent=None, is_left=None, path=[]): 129 | if node is None or node.is_leaf_node(): 130 | return [] 131 | leaves = [] 132 | if node.left.is_leaf_node() and node.right.is_leaf_node(): 133 | leaves.append((node, parent, is_left, path)) 134 | leaves += FairTreeClassifier.candidates(node.left, node, True, path + [(node, True)]) 135 | leaves += FairTreeClassifier.candidates(node.right, node, False, path + [(node, False)]) 136 | return leaves 137 | 138 | def prune_least_important_leaf(self, X, y): 139 | best_score = -np.inf 140 | best_prune = None 141 | 142 | for node, parent, is_left, path in self.candidates(self.root): 143 | original_left = node.left 144 | original_right = node.right 145 | 146 | merged_y = y[(X.apply(lambda x: self._match_path(x, path), axis=1))] 147 | if len(merged_y) == 0: 148 | continue 149 | new_value = self._estimate_output(merged_y) 150 | node.left = node.right = None 151 | node.value = new_value 152 | 153 | score = self.score(X, y) 154 | if score >= best_score: 155 | best_score = score 156 | best_prune = (node, new_value) 157 | 158 | node.left, node.right, node.value = original_left, original_right, None 159 | 160 | if best_prune: 161 | best_prune[0].left = best_prune[0].right = None 162 | best_prune[0].value = best_prune[1] 163 | 164 | 165 | class FairTreeClassifier(FairTree): 166 | def __init__(self, 
max_depth=3, max_leaves=None, criterion='entropy', min_samples_split=2, lambda_penalty=0.0, 167 | protected_attr=None): 168 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 169 | self.quality_function = self._gini if self.criterion == 'gini' else self._entropy 170 | 171 | @staticmethod 172 | def _estimate_output(y): 173 | return Counter(y.values.flatten()).most_common(1)[0][0] 174 | 175 | def score(self, X, y): 176 | return accuracy_score(y.values.flatten(), self.predict(X)) 177 | 178 | @staticmethod 179 | def _entropy(y): 180 | ps = np.unique(y, return_counts=True)[1] / len(y) 181 | return -np.sum([p * np.log2(p) for p in ps if p > 0]) 182 | 183 | @staticmethod 184 | def _gini(y): 185 | # Gini impurity: 1 - sum of squared class proportions 186 | return 1.0 - np.sum((np.unique(y, return_counts=True)[1] / len(y)) ** 2) 187 | 188 | 189 | class FairTreeRegressor(FairTree): 190 | def __init__(self, max_depth=3, max_leaves=None, criterion='mse', min_samples_split=2, lambda_penalty=0.0, 191 | protected_attr=None): 192 | super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr) 193 | self.quality_function = self._mse 194 | 195 | @staticmethod 196 | def _estimate_output(y): 197 | return np.mean(y.values.flatten()) 198 | 199 | def score(self, X, y): 200 | return r2_score(y.values.flatten(), self.predict(X)) 201 | 202 | @staticmethod 203 | def _mse(y): 204 | y = y.values.flatten().astype(float) 205 | return np.mean((y - np.mean(y))**2) 206 | -------------------------------------------------------------------------------- /psyke/extraction/cart/FairTreePredictor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Union, Any 3 | 4 | from psyke.extraction.cart import FairTreeClassifier, FairTreeRegressor, LeafSequence, LeafConstraints 5 | from psyke.extraction.cart.CartPredictor import CartPredictor 6 | from psyke.schema import LessThan, GreaterThan, SchemaException, Value 7 | 8 | 9 | class FairTreePredictor(CartPredictor): 10 | """ 11 | A wrapper for psyke's fair decision and regression trees.
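
    Minimal usage sketch (illustrative only: X_train and y_train are hypothetical pandas DataFrames, 'sex' a
    hypothetical protected column; the fit(X, y) call mirrors the one issued by Cart._extract):

        tree = FairTreeClassifier(max_depth=3, lambda_penalty=0.5, protected_attr=['sex'])
        tree.fit(X_train, y_train)
        wrapper = FairTreePredictor(tree)
        for constraints, output in wrapper:  # one (feature constraints, predicted value) pair per leaf
            ...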
12 | """ 13 | 14 | def __init__(self, predictor: Union[FairTreeClassifier, FairTreeRegressor] = FairTreeClassifier(), 15 | discretization=None, normalization=None): 16 | super().__init__(predictor, discretization, normalization) 17 | 18 | def __iter__(self) -> LeafSequence: 19 | leaves = [node for node in self.recurse(self._predictor.root, {})] 20 | return (leaf for leaf in leaves) 21 | 22 | @staticmethod 23 | def merge_constraints(constraints: LeafConstraints, constraint: Value, feature: str): 24 | if feature in constraints: 25 | try: 26 | constraints[feature][-1] *= constraint 27 | except SchemaException: 28 | constraints[feature].append(constraint) 29 | else: 30 | constraints[feature] = [constraint] 31 | return constraints 32 | 33 | def recurse(self, node, constraints) -> Union[LeafSequence, tuple[LeafConstraints, Any]]: 34 | if node.is_leaf_node(): 35 | return constraints, node.value 36 | 37 | feature = node.feature 38 | threshold = node.threshold if self.normalization is None else \ 39 | (node.threshold * self.normalization[feature][1] + self.normalization[feature][0]) 40 | 41 | left = self.recurse(node.left, self.merge_constraints(copy.deepcopy(constraints), LessThan(threshold), feature)) 42 | right = self.recurse(node.right, self.merge_constraints(copy.deepcopy(constraints), 43 | GreaterThan(threshold), feature)) 44 | return (left if isinstance(left, list) else [left]) + (right if isinstance(right, list) else [right]) 45 | 46 | @property 47 | def predictor(self) -> Union[FairTreeClassifier, FairTreeRegressor]: 48 | return self._predictor 49 | 50 | @property 51 | def n_leaves(self) -> int: 52 | return self._predictor.n_leaves 53 | 54 | @predictor.setter 55 | def predictor(self, predictor: Union[FairTreeClassifier, FairTreeRegressor]): 56 | self._predictor = predictor 57 | -------------------------------------------------------------------------------- /psyke/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 4 | 5 | from psyke.extraction import PedagogicalExtractor 6 | from psyke import get_default_random_seed 7 | from psyke.extraction.cart.FairTree import FairTreeClassifier, FairTreeRegressor 8 | from psyke.schema import DiscreteFeature, Value 9 | from tuprolog.theory import Theory 10 | from typing import Iterable, Any 11 | import pandas as pd 12 | 13 | 14 | TREE_SEED = get_default_random_seed() 15 | 16 | LeafConstraints = dict[str, list[Value]] 17 | LeafSequence = Iterable[tuple[LeafConstraints, Any]] 18 | 19 | 20 | class Cart(PedagogicalExtractor, ABC): 21 | 22 | def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None, 23 | discretization: Iterable[DiscreteFeature] = None, 24 | normalization=None, simplify: bool = True): 25 | from psyke.extraction.cart.CartPredictor import CartPredictor 26 | 27 | super().__init__(predictor, discretization, normalization) 28 | self.is_fair = None 29 | self._cart_predictor = CartPredictor(discretization=discretization, normalization=normalization) 30 | self.depth = max_depth 31 | self.leaves = max_leaves 32 | self.max_features = max_features 33 | self._simplify = simplify 34 | 35 | def _extract(self, data: pd.DataFrame) -> Theory: 36 | from psyke.extraction.cart.FairTreePredictor import FairTreePredictor 37 | 38 | if self.is_fair: 39 | self._cart_predictor = FairTreePredictor(discretization=self.discretization, 40 | normalization=self.normalization) 41 | 
fair_tree = FairTreeClassifier if isinstance(data.iloc[0, -1], str) else FairTreeRegressor 42 | self._cart_predictor.predictor = fair_tree(max_depth=self.depth, max_leaves=self.leaves, 43 | protected_attr=self.is_fair) 44 | else: 45 | tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor 46 | self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth, 47 | max_leaf_nodes=self.leaves, max_features=self.max_features) 48 | self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1]) 49 | return self._cart_predictor.create_theory(data, self._simplify) 50 | 51 | def make_fair(self, features: Iterable[str]): 52 | self.is_fair = features 53 | 54 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 55 | return self._cart_predictor.predict(dataframe) 56 | 57 | def predict_why(self, data: dict[str, float], verbose=True): 58 | prediction = None 59 | conditions = {} 60 | if self.normalization is not None: 61 | data = {k: v * self.normalization[k][1] + self.normalization[k][0] if k in self.normalization else v 62 | for k, v in data.items()} 63 | for conditions, prediction in self._cart_predictor: 64 | if all(all(interval.is_in(data[variable]) for interval in intervals) 65 | for variable, intervals in conditions.items()): 66 | break 67 | return prediction, conditions 68 | 69 | @property 70 | def n_rules(self) -> int: 71 | return self._cart_predictor.n_leaves 72 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | from collections import Iterable 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.base import ClassifierMixin 9 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 10 | from sklearn.linear_model import LinearRegression 11 | from tuprolog.core import Var, Struct, clause 12 | from tuprolog.theory import Theory, mutable_theory 13 | from psyke.extraction import PedagogicalExtractor 14 | from psyke.extraction.hypercubic.hypercube import HyperCube, RegressionCube, ClassificationCube, ClosedCube, Point, \ 15 | GenericCube 16 | from psyke.hypercubepredictor import HyperCubePredictor 17 | from psyke.schema import Value 18 | from psyke.utils.logic import create_variable_list, create_head, to_var, Simplifier 19 | from psyke.utils import Target 20 | from psyke.extraction.hypercubic.strategy import Strategy, FixedStrategy 21 | 22 | 23 | class HyperCubeExtractor(HyperCubePredictor, PedagogicalExtractor, ABC): 24 | def __init__(self, predictor, output, discretization=None, normalization=None): 25 | HyperCubePredictor.__init__(self, output=output, normalization=normalization) 26 | PedagogicalExtractor.__init__(self, predictor, discretization=discretization, normalization=normalization) 27 | self._default_surrounding_cube = False 28 | 29 | def _default_cube(self) -> HyperCube | RegressionCube | ClassificationCube: 30 | if self._output == Target.CONSTANT: 31 | return HyperCube() 32 | if self._output == Target.REGRESSION: 33 | return RegressionCube() 34 | return ClassificationCube() 35 | 36 | def _sort_cubes(self): 37 | cubes = [(cube.diversity, i, cube) for i, cube in enumerate(self._hypercubes)] 38 | cubes.sort() 39 | self._hypercubes = [cube[2] for cube in cubes] 40 | 41 | def extract(self, dataframe: pd.DataFrame) -> Theory: 42 | theory = 
PedagogicalExtractor.extract(self, dataframe) 43 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 44 | self._surrounding.update(dataframe, self.predictor) 45 | return theory 46 | 47 | def pairwise_fairness(self, data: dict[str, float], neighbor: dict[str, float]): 48 | cube1 = self._find_cube(data) 49 | cube2 = self._find_cube(neighbor) 50 | different_prediction_reasons = [] 51 | 52 | if cube1.output == cube2.output: 53 | print("Prediction", cube1.output, "is FAIR") 54 | else: 55 | print("Prediction", cube1.output, "may be UNFAIR") 56 | print("It could be", cube2.output, "if:") 57 | for d in data: 58 | a, b = cube2.dimensions[d] 59 | if data[d] < a: 60 | print(' ', d, 'increases above', round(a, 1)) 61 | different_prediction_reasons.append(d) 62 | elif data[d] > b: 63 | print(' ', d, 'decreases below', round(b, 1)) 64 | different_prediction_reasons.append(d) 65 | return different_prediction_reasons 66 | 67 | def predict_counter(self, data: dict[str, float], verbose=True, only_first=True): 68 | output = "" 69 | prediction = None 70 | cube = self._find_cube(data) 71 | if cube is None: 72 | output += "The extracted knowledge is not exhaustive; impossible to predict this instance" 73 | else: 74 | prediction = self._predict_from_cubes(data) 75 | output += f"The output is {prediction}\n" 76 | 77 | point = Point(list(data.keys()), list(data.values())) 78 | cubes = self._hypercubes if cube is None else [c for c in self._hypercubes if cube.output != c.output] 79 | cubes = sorted([(cube.surface_distance(point), cube.volume(), i, cube) for i, cube in enumerate(cubes)]) 80 | 81 | counter_conditions = [] 82 | 83 | for _, _, _, c in cubes: 84 | if not only_first or c.output not in [o for o, _ in counter_conditions]: 85 | counter_conditions.append((c.output, {c: [val for val in v if val is not None and not val.is_in( 86 | self.unscale(data[c], c))] for c, v in self.__get_conditions(data, c).items()})) 87 | 88 | if verbose: 89 | for o, conditions in counter_conditions: 90 | output += f"The output may be {o} if\n" + HyperCubeExtractor.__conditions_to_string(conditions) 91 | print(output) 92 | 93 | return prediction, counter_conditions 94 | 95 | @staticmethod 96 | def __conditions_to_string(conditions: dict[str, list[Value]]) -> str: 97 | output = "" 98 | for d in conditions: 99 | for i, condition in enumerate(conditions[d]): 100 | if i == 0: 101 | output += f' {d} is ' 102 | else: 103 | output += ' and ' 104 | output += condition.print() 105 | if i + 1 == len(conditions[d]): 106 | output += '\n' 107 | return output 108 | 109 | def __get_conditions(self, data: dict[str, float], cube: GenericCube) -> dict[str, list[Value]]: 110 | conditions = {d: [cube.interval_to_value(d, self.unscale)] for d in data.keys() 111 | if d not in self._dimensions_to_ignore} 112 | for c in cube.subcubes(self._hypercubes): 113 | for d in conditions: 114 | condition = c.interval_to_value(d, self.unscale) 115 | if condition is None: 116 | continue 117 | elif conditions[d][-1] is None: 118 | conditions[d][-1] = -condition 119 | else: 120 | try: 121 | conditions[d][-1] *= -condition 122 | except Exception: 123 | conditions[d].append(-condition) 124 | return conditions 125 | 126 | def predict_why(self, data: dict[str, float], verbose=True): 127 | cube = self._find_cube(data) 128 | output = "" 129 | if cube is None: 130 | output += "The extracted knowledge is not exhaustive; impossible to predict this instance\n" 131 | if verbose: 132 | print(output) 133 | return None, {} 134 | prediction = 
self._predict_from_cubes(data) 135 | output += f"The output is {prediction} because\n" 136 | conditions = {c: [val for val in v if val is not None and val.is_in(self.unscale(data[c], c))] 137 | for c, v in self.__get_conditions(data, cube).items()} 138 | 139 | if verbose: 140 | output += HyperCubeExtractor.__conditions_to_string(conditions) 141 | print(output) 142 | 143 | return prediction, conditions 144 | 145 | @staticmethod 146 | def _create_head(dataframe: pd.DataFrame, variables: list[Var], output: float | LinearRegression) -> Struct: 147 | return create_head(dataframe.columns[-1], variables[:-1], output) \ 148 | if not isinstance(output, LinearRegression) else \ 149 | create_head(dataframe.columns[-1], variables[:-1], variables[-1]) 150 | 151 | def __drop(self, dataframe: pd.DataFrame): 152 | self._hypercubes = [cube for cube in self._hypercubes if cube.count(dataframe) > 1] 153 | 154 | def _create_theory(self, dataframe: pd.DataFrame) -> Theory: 155 | # self.__drop(dataframe) 156 | for cube in self._hypercubes: 157 | for dimension in cube.dimensions: 158 | if abs(cube[dimension][0] - self._surrounding[dimension][0]) < HyperCube.EPSILON * 2: 159 | cube.set_infinite(dimension, '-') 160 | if abs(cube[dimension][1] - self._surrounding[dimension][1]) < HyperCube.EPSILON * 2: 161 | cube.set_infinite(dimension, '+') 162 | 163 | if self._default_surrounding_cube: 164 | self._hypercubes[-1].set_default() 165 | 166 | new_theory = mutable_theory() 167 | for cube in self._hypercubes: 168 | variables = create_variable_list([], dataframe) 169 | variables[dataframe.columns[-1]] = to_var(dataframe.columns[-1]) 170 | head = HyperCubeExtractor._create_head(dataframe, list(variables.values()), 171 | self.unscale(cube.output, dataframe.columns[-1])) 172 | body = cube.body(variables, self._dimensions_to_ignore, self.unscale, self.normalization) 173 | new_theory.assertZ(clause(head, body)) 174 | return HyperCubeExtractor._prettify_theory(new_theory) 175 | 176 | @staticmethod 177 | def _prettify_theory(theory: Theory) -> Theory: 178 | visitor = Simplifier() 179 | new_clauses = [] 180 | for c in theory.clauses: 181 | body = c.body 182 | structs = body.unfolded if c.body_size > 1 else [body] 183 | new_structs = [] 184 | for s in structs: 185 | new_structs.append(s.accept(visitor)) 186 | new_clauses.append(clause(c.head, new_structs)) 187 | return mutable_theory(new_clauses) 188 | 189 | 190 | class FeatureRanker: 191 | def __init__(self, feat): 192 | self.scores = None 193 | self.feat = feat 194 | 195 | def fit(self, model, samples): 196 | predictions = np.array(model.predict(samples)).flatten() 197 | function = f_classif if isinstance(model, ClassifierMixin) else f_regression 198 | best = SelectKBest(score_func=function, k="all").fit(samples, predictions) 199 | self.scores = np.array(best.scores_) / max(best.scores_) 200 | return self 201 | 202 | def fit_on_data(self, samples): 203 | function = f_classif if isinstance(samples.iloc[0, -1], str) else f_regression 204 | best = SelectKBest(score_func=function, k="all").fit(samples.iloc[:, :-1], samples.iloc[:, -1]) 205 | self.scores = np.array(best.scores_) / max(best.scores_) 206 | return self 207 | 208 | def rankings(self): 209 | return list(zip(self.feat, self.scores)) 210 | 211 | 212 | class Grid: 213 | def __init__(self, iterations: int = 1, strategy: Strategy | Iterable[Strategy] = FixedStrategy()): 214 | self.iterations = iterations 215 | self.strategy = strategy 216 | 217 | def make_fair(self, features: Iterable[str]): 218 | if isinstance(self.strategy, 
Strategy): 219 | self.strategy.make_fair(features) 220 | elif isinstance(self.strategy, Iterable): 221 | [strategy.make_fair(features) for strategy in self.strategy] 222 | 223 | def get(self, feature: str, depth: int) -> int: 224 | if isinstance(self.strategy, list): 225 | return self.strategy[depth].get(feature) 226 | else: 227 | return self.strategy.get(feature) 228 | 229 | def iterate(self) -> range: 230 | return range(self.iterations) 231 | 232 | def __repr__(self): 233 | return self.__str__() 234 | 235 | def __str__(self): 236 | return "Grid ({}). {}".format(self.iterations, self.strategy) 237 | 238 | 239 | class Node: 240 | def __init__(self, dataframe: pd.DataFrame, cube: ClosedCube = None): 241 | self.dataframe = dataframe 242 | self.cube: ClosedCube = cube 243 | self.right: Node | None = None 244 | self.left: Node | None = None 245 | 246 | @property 247 | def children(self) -> list[Node]: 248 | return [self.right, self.left] 249 | 250 | def search(self, point: dict[str, float]) -> ClosedCube: 251 | if self.right is None: 252 | return self.cube 253 | if point in self.right.cube: 254 | return self.right.search(point) 255 | return self.left.search(point) 256 | 257 | @property 258 | def leaves(self): 259 | if self.right is None: 260 | return 1 261 | return self.right.leaves + self.left.leaves 262 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/cosmik/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.mixture import GaussianMixture 4 | from tuprolog.theory import Theory 5 | 6 | from psyke import Target, Extractor, get_default_random_seed 7 | from psyke.clustering.utils import select_gaussian_mixture 8 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor, RegressionCube 9 | 10 | 11 | class COSMiK(HyperCubeExtractor): 12 | """ 13 | Explanator implementing COSMiK algorithm. 
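
    It fits a Gaussian mixture to the input data, lets DiViNE extract hypercubes from the mixture's cluster
    labels, and then re-labels the resulting cubes with the wrapped predictor (see _extract below).
    Illustrative call pattern, where `model` is a hypothetical fitted predictor and `df` a DataFrame whose
    last column is the target:

        cosmik = COSMiK(model, max_components=4, k=5, patience=15)
        theory = cosmik.extract(df)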
14 | """ 15 | 16 | def __init__(self, predictor, max_components: int = 4, k: int = 5, patience: int = 15, close_to_center: bool = True, 17 | output: Target = Target.CONSTANT, discretization=None, normalization=None, 18 | seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.REGRESSION, discretization, normalization) 20 | self.max = max_components 21 | self.k = k 22 | self.patience = patience 23 | self.output = output 24 | self.close_to_center = close_to_center 25 | self.seed = seed 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | np.random.seed(self.seed) 29 | X, y = dataframe.iloc[:, :-1], dataframe.iloc[:, -1] 30 | 31 | _, n, _ = select_gaussian_mixture(dataframe, self.max) 32 | gmm = GaussianMixture(n) 33 | gmm.fit(X, y) 34 | 35 | divine = Extractor.divine(gmm, self.k, self.patience, self.close_to_center, 36 | self.discretization, self.normalization) 37 | df = X.join(pd.DataFrame(gmm.predict(X))) 38 | df.columns = dataframe.columns 39 | divine.extract(df) 40 | 41 | self._hypercubes = [HyperCube(cube.dimensions.copy()) if self.output == Target.CONSTANT else 42 | RegressionCube(cube.dimensions.copy()) for cube in divine._hypercubes] 43 | for cube in self._hypercubes: 44 | cube.update(dataframe, self.predictor) 45 | 46 | self._sort_cubes() 47 | return self._create_theory(dataframe) -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/creepy/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import Iterable 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from tuprolog.theory import Theory 8 | from psyke import Clustering 9 | from psyke.clustering import HyperCubeClustering 10 | from psyke.extraction.hypercubic import HyperCubeExtractor 11 | from psyke.utils import Target, get_default_random_seed 12 | 13 | 14 | class CReEPy(HyperCubeExtractor): 15 | """ 16 | Explanator implementing CReEPy algorithm. 
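
    The supplied clustering must build hypercubes (a HyperCubeClustering, see _extract below); features whose
    relevance in `ranks` falls below `ignore_threshold` are left out of the extracted rules. A sketch of the
    intended call pattern, with a hypothetical fitted predictor `model` and a DataFrame `df` whose last column
    is the target:

        creepy = CReEPy(model, depth=3, error_threshold=0.1, output=Target.CONSTANT)
        theory = creepy.extract(df)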
17 | """ 18 | 19 | def __init__(self, predictor, clustering=Clustering.exact, depth: int = 3, error_threshold: float = 0.1, 20 | output: Target = Target.CONSTANT, gauss_components: int = 5, ranks: Iterable[(str, float)] = tuple(), 21 | ignore_threshold: float = 0.0, discretization=None, normalization=None, 22 | seed: int = get_default_random_seed()): 23 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 24 | discretization, normalization) 25 | self.clustering = clustering(depth, error_threshold, self._output, gauss_components, discretization, 26 | normalization, seed) 27 | self._default_surrounding_cube = True 28 | self._dimensions_to_ignore = set([dimension for dimension, relevance in ranks if relevance < ignore_threshold]) 29 | 30 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 31 | if not isinstance(self.clustering, HyperCubeClustering): 32 | raise TypeError("clustering must be a HyperCubeClustering") 33 | 34 | self.clustering.fit(dataframe) 35 | self._hypercubes = self.clustering.get_hypercubes() 36 | self._surrounding = self._hypercubes[-1] 37 | return self._create_theory(dataframe) 38 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/divine/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tuprolog.theory import Theory 4 | 5 | from psyke import Target, get_default_random_seed 6 | from psyke.extraction.hypercubic import HyperCubeExtractor 7 | from psyke.extraction.hypercubic.hypercube import Point, GenericCube, HyperCube 8 | 9 | from sklearn.neighbors import BallTree 10 | 11 | 12 | class DiViNE(HyperCubeExtractor): 13 | """ 14 | Explanator implementing DiViNE algorithm. 
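
    Instances whose k nearest neighbours do not all share one class are discarded first; a cube is then grown
    from a randomly drawn seed point by merging the closest same-class instances, tolerating up to `patience`
    mismatching neighbours before the cube is closed (see _extract below). A sketch of the intended call
    pattern, with a hypothetical fitted classifier `model` and a labelled DataFrame `df`:

        divine = DiViNE(model, k=5, patience=15)
        theory = divine.extract(df)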
15 | """ 16 | 17 | def __init__(self, predictor, k: int = 5, patience: int = 15, close_to_center: bool = True, 18 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 19 | super().__init__(predictor, Target.CLASSIFICATION, discretization, normalization) 20 | self.k = k 21 | self.patience = patience 22 | self.vicinity_function = DiViNE.closest_to_center if close_to_center else DiViNE.closest_to_corners 23 | self.seed = seed 24 | 25 | @staticmethod 26 | def __pop(data: pd.DataFrame, idx: int = None) -> (Point, pd.DataFrame): 27 | if idx is None: 28 | idx = data.sample(1).index.values[0] 29 | t = data.T 30 | return DiViNE.__to_point(t.pop(idx)), t.T.reset_index(drop=True) 31 | 32 | @staticmethod 33 | def __to_point(instance) -> Point: 34 | point = Point(instance.index.values, instance.values) 35 | return point 36 | 37 | def __to_cube(self, point: Point) -> GenericCube: 38 | cube = HyperCube.cube_from_point(point.dimensions, self._output) 39 | cube._output = list(point.dimensions.values())[-1] 40 | return cube 41 | 42 | def __clean(self, data: pd.DataFrame) -> pd.DataFrame: 43 | _, idx = BallTree(data.iloc[:, :-1]).query(data.iloc[:, :-1], k=self.k) 44 | # how many output classes are associated with the k neighbors 45 | count = np.array(list(map(lambda indices: len(data.iloc[indices].iloc[:, -1].unique()), idx))) 46 | # instances with neighbors of different classes are discarded 47 | return data[count == 1] 48 | 49 | def __closest(self, data: pd.DataFrame, cube: GenericCube) -> (Point, pd.DataFrame): 50 | return DiViNE.__pop(data, self.vicinity_function(BallTree(data.iloc[:, :-1]), cube)) 51 | 52 | @staticmethod 53 | def closest_to_center(tree: BallTree, cube: GenericCube): 54 | return tree.query([list(cube.center.dimensions.values())], k=1)[1][0][-1] 55 | 56 | @staticmethod 57 | def closest_to_corners(tree: BallTree, cube: GenericCube): 58 | distance, idx = tree.query([list(point.dimensions.values()) for point in cube.corners()], k=1) 59 | return idx[np.argmin(distance)][-1] 60 | 61 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 62 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=Target.CLASSIFICATION) 63 | np.random.seed(self.seed) 64 | data = self.__clean(dataframe) 65 | 66 | while len(data) > 0: 67 | discarded = [] 68 | patience = self.patience 69 | point, data = self.__pop(data) 70 | cube = self.__to_cube(point) 71 | 72 | while patience > 0 and len(data) > 0: 73 | other, data = self.__closest(data, cube) 74 | if cube.output == list(other.dimensions.values())[-1]: 75 | cube = cube.merge_with_point(other) 76 | data = data[~(cube.filter_indices(data.iloc[:, :-1]))].reset_index(drop=True) 77 | else: 78 | patience -= 1 79 | discarded.append(other) 80 | if cube.volume() > 0: 81 | cube.update(dataframe, self.predictor) 82 | self._hypercubes.append(cube) 83 | if len(discarded) > 0: 84 | data = pd.concat([data] + [d.to_dataframe() for d in discarded]).reset_index(drop=True) 85 | self._sort_cubes() 86 | return self._create_theory(dataframe) 87 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import product 3 | from typing import Iterable 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | from tuprolog.theory import Theory 8 | from psyke import get_default_random_seed 9 | 
from psyke.utils import Target 10 | from psyke.extraction.hypercubic import HyperCubeExtractor, Grid, HyperCube 11 | 12 | 13 | class GridEx(HyperCubeExtractor): 14 | """ 15 | Explanator implementing GridEx algorithm, doi:10.1007/978-3-030-82017-6_2. 16 | """ 17 | 18 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 19 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 20 | super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output, 21 | discretization, normalization) 22 | self.grid = grid 23 | self.min_examples = min_examples 24 | self.threshold = threshold 25 | np.random.seed(seed) 26 | 27 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 28 | self._hypercubes = [] 29 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output) 30 | self._surrounding.init_diversity(2 * self.threshold) 31 | self._iterate(dataframe) 32 | return self._create_theory(dataframe) 33 | 34 | def _create_ranges(self, cube, iteration): 35 | ranges = {} 36 | for (feature, (a, b)) in cube.dimensions.items(): 37 | n_bins = self.grid.get(feature, iteration) 38 | if n_bins == 1: 39 | ranges[feature] = [(a, b)] 40 | self._dimensions_to_ignore.add(feature) 41 | else: 42 | size = (b - a) / n_bins 43 | ranges[feature] = [(a + size * i, a + size * (i + 1)) for i in range(n_bins)] 44 | return ranges 45 | 46 | def _cubes_to_split(self, cube, iteration, dataframe, fake, keep_empty=False): 47 | to_split = [] 48 | for p in product(*self._create_ranges(cube, iteration).values()): 49 | cube = self._default_cube() 50 | for i, f in enumerate(dataframe.columns[:-1]): 51 | cube.update_dimension(f, p[i]) 52 | n = cube.count(dataframe) 53 | if n > 0 or keep_empty: 54 | fake = pd.concat([fake, cube.create_samples(self.min_examples - n)]) 55 | cube.update(fake, self.predictor) 56 | to_split.append(cube) 57 | return to_split, fake 58 | 59 | def _iterate(self, dataframe: pd.DataFrame): 60 | fake = dataframe.copy() 61 | prev = [self._surrounding] 62 | next_iteration = [] 63 | 64 | for iteration in self.grid.iterate(): 65 | next_iteration = [] 66 | for cube in prev: 67 | if cube.count(dataframe) == 0: 68 | continue 69 | if cube.diversity < self.threshold: 70 | self._hypercubes += [cube] 71 | continue 72 | to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake) 73 | next_iteration += [c for c in self._merge(to_split, fake)] 74 | prev = next_iteration.copy() 75 | self._hypercubes += [cube for cube in next_iteration] 76 | 77 | @staticmethod 78 | def _find_couples(to_split: Iterable[HyperCube], not_in_cache: Iterable[HyperCube], 79 | adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \ 80 | Iterable[tuple[HyperCube, HyperCube, str]]: 81 | checked = [] 82 | eligible = [] 83 | for cube in to_split: 84 | checked.append(cube) 85 | for other_cube in [c for c in to_split if c not in checked]: 86 | if (cube in not_in_cache) or (other_cube in not_in_cache): 87 | adjacent_cache[(cube, other_cube)] = cube.is_adjacent(other_cube) 88 | adjacent_feature = adjacent_cache[(cube, other_cube)] 89 | eligible.append((cube, other_cube, adjacent_feature)) 90 | return [couple for couple in eligible if couple[2] is not None] 91 | 92 | def _evaluate_merge(self, not_in_cache: Iterable[HyperCube], 93 | dataframe: pd.DataFrame, feature: str, 94 | cube: HyperCube, other_cube: HyperCube, 95 | merge_cache: dict[(HyperCube, HyperCube), HyperCube | None]) -> bool: 96 | if 
(cube in not_in_cache) or (other_cube in not_in_cache): 97 | merged_cube = cube.merge_along_dimension(other_cube, feature) 98 | merged_cube.update(dataframe, self.predictor) 99 | merge_cache[(cube, other_cube)] = merged_cube 100 | return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \ 101 | merge_cache[(cube, other_cube)].diversity < self.threshold 102 | 103 | def _merge(self, to_split: Iterable[HyperCube], dataframe: pd.DataFrame) -> Iterable[HyperCube]: 104 | not_in_cache = [cube for cube in to_split] 105 | adjacent_cache = {} 106 | merge_cache = {} 107 | cont = True 108 | while cont: 109 | to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in 110 | GridEx._find_couples(to_split, not_in_cache, adjacent_cache) if 111 | self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)] 112 | if len(to_merge) == 0: 113 | cont = False 114 | else: 115 | # sort in place so that the lowest-diversity merge is picked first 116 | to_merge.sort(key=lambda c: c[1].diversity) 117 | best = to_merge[0] 118 | to_split = [cube for cube in to_split if cube not in best[0]] + [best[1]] 119 | not_in_cache = [best[1]] 120 | return to_split 121 | 122 | def make_fair(self, features: Iterable[str]): 123 | self.grid.make_fair(features) 124 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/gridrex/__init__.py: -------------------------------------------------------------------------------- 1 | from psyke import get_default_random_seed, Target 2 | from psyke.extraction.hypercubic import Grid, RegressionCube 3 | from psyke.extraction.hypercubic.gridex import GridEx 4 | 5 | 6 | class GridREx(GridEx): 7 | """ 8 | Explanator implementing GridREx algorithm. 9 | """ 10 | 11 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, normalization, 12 | seed=get_default_random_seed()): 13 | super().__init__(predictor, grid, min_examples, threshold, Target.REGRESSION, None, normalization, seed) 14 | 15 | def _default_cube(self) -> RegressionCube: 16 | return RegressionCube() 17 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/hex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from psyke import get_default_random_seed, Target 9 | from psyke.extraction.hypercubic import Grid, HyperCube, GenericCube, ClassificationCube 10 | from psyke.extraction.hypercubic.gridex import GridEx 11 | 12 | 13 | class HEx(GridEx): 14 | """ 15 | Explanator implementing HEx algorithm.
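
    It refines GridEx hierarchically: cubes are split level by level and a child cube is kept only when it
    improves on its parent, i.e. it predicts a different class or reduces the error by more than
    0.6 * threshold (see Node.check and _gain below). Illustrative call pattern, where `model` is a
    hypothetical fitted classifier, `df` a labelled DataFrame, and FixedStrategy comes from
    psyke.extraction.hypercubic.strategy:

        hex_extractor = HEx(model, Grid(iterations=2, strategy=FixedStrategy(2)), min_examples=100, threshold=0.1)
        theory = hex_extractor.extract(df)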
16 | """ 17 | 18 | class Node: 19 | def __init__(self, cube: GenericCube, parent: HEx.Node = None, threshold: float = None): 20 | self.cube = cube 21 | self.parent = parent 22 | self.children: Iterable[HEx.Node] = [] 23 | self.threshold = threshold 24 | self.gain = True if parent is None else self.check() 25 | 26 | def check(self) -> bool: 27 | other = self.parent 28 | try: 29 | while not other.gain: 30 | other = other.parent 31 | except AttributeError: 32 | return True 33 | if isinstance(other.cube, ClassificationCube): 34 | return other.cube.output != self.cube.output 35 | return other.cube.error - self.cube.error > self.threshold * .6 36 | 37 | def indices(self, dataframe: pd.DataFrame): 38 | return self.cube.filter_indices(dataframe.iloc[:, :-1]) 39 | 40 | def eligible_children(self, dataframe) -> Iterable[HEx.Node]: 41 | return [c for c in self.children if c.cube.count(dataframe) > 0] 42 | 43 | def permanent_children(self, dataframe) -> Iterable[HEx.Node]: 44 | return [c for c in self.eligible_children(dataframe) if c.gain] 45 | 46 | def permanent_indices(self, dataframe): 47 | return np.any([c.cube.filter_indices(dataframe.iloc[:, :-1]) 48 | for c in self.eligible_children(dataframe) if c.gain], axis=0) 49 | 50 | def update(self, dataframe: pd.DataFrame, predictor, recursive=False): 51 | if recursive: 52 | for node in self.children: 53 | node.update(dataframe, predictor, recursive) 54 | cleaned = [(c.cube, c.gain) for c in self.eligible_children(dataframe)] 55 | idx = self.permanent_indices(dataframe) 56 | 57 | if sum(g for _, g in cleaned) > 0 and sum(self.indices(dataframe)) > sum(idx) and self.gain: 58 | self.cube.update(dataframe[self.indices(dataframe) & ~idx], predictor) 59 | return cleaned 60 | 61 | def linearize(self, dataframe, depth=1): 62 | children = [c.linearize(dataframe, depth + 1) for c in self.permanent_children(dataframe)] 63 | return [(cc, dd) for c in children for cc, dd in c if c != []] + \ 64 | [(c, depth) for c in self.permanent_children(dataframe)] 65 | 66 | def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT, 67 | discretization=None, normalization=None, seed: int = get_default_random_seed()): 68 | super().__init__(predictor, grid, min_examples, threshold, output, discretization, normalization, seed) 69 | self._default_surrounding_cube = True 70 | 71 | def _gain(self, parent_cube: GenericCube, new_cube: GenericCube) -> float: 72 | if isinstance(parent_cube, ClassificationCube): 73 | return parent_cube.output != new_cube.output 74 | return parent_cube.error - new_cube.error > self.threshold * .6 75 | 76 | def _iterate(self, dataframe: pd.DataFrame): 77 | fake = dataframe.copy() 78 | self._surrounding.update(dataframe, self.predictor) 79 | root = HEx.Node(self._surrounding, threshold=self.threshold) 80 | current = [root] 81 | 82 | for iteration in self.grid.iterate(): 83 | next_iteration = [] 84 | for node in current: 85 | if node.cube.diversity < self.threshold: 86 | continue 87 | children, fake = self._cubes_to_split(node.cube, iteration, dataframe, fake, True) 88 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in children] 89 | cleaned = node.update(fake, self.predictor, False) 90 | node.children = [HEx.Node(c, node, threshold=self.threshold) for c in self._merge( 91 | [c for c, _ in cleaned], fake)] 92 | next_iteration += [n for n in node.children] 93 | 94 | current = next_iteration.copy() 95 | _ = root.update(fake, self.predictor, True) 96 | self._hypercubes = [] 97 | 
linearized = root.linearize(fake) 98 | for depth in sorted(np.unique([d for (_, d) in linearized]), reverse=True): 99 | self._hypercubes += self._merge([c.cube for (c, d) in linearized if d == depth], fake) 100 | 101 | if len(self._hypercubes) == 0: 102 | self._hypercubes = [self._surrounding] 103 | elif not min(np.any([c.filter_indices(dataframe.iloc[:, :-1]) for c in self._hypercubes], axis=0)): 104 | self._hypercubes = self._hypercubes + [self._surrounding] 105 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import ClassifierMixin 6 | from tuprolog.theory import Theory 7 | from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor 8 | from psyke.extraction.hypercubic.hypercube import GenericCube 9 | from psyke.extraction.hypercubic.utils import MinUpdate, Expansion 10 | from psyke.utils import get_default_random_seed, Target 11 | 12 | 13 | class ITER(HyperCubeExtractor): 14 | """ 15 | Explanator implementing ITER algorithm, doi:10.1007/11823728_26. 16 | """ 17 | 18 | def __init__(self, predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps, 19 | ignore_dimensions: Iterable, normalization, output: Target = Target.CONSTANT, 20 | seed=get_default_random_seed()): 21 | super().__init__(predictor, output, normalization=normalization) 22 | if output is Target.REGRESSION: 23 | raise NotImplementedError 24 | self.predictor = predictor 25 | self.min_update = min_update 26 | self._init_points = n_points 27 | self.n_points = n_points 28 | self.max_iterations = max_iterations 29 | self.min_examples = min_examples 30 | self.threshold = threshold 31 | self.fill_gaps = fill_gaps 32 | self._output = Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else \ 33 | output if output is not None else Target.CONSTANT 34 | self.seed = seed 35 | self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else [] 36 | 37 | def make_fair(self, features: Iterable[str]): 38 | self.n_points = self._init_points 39 | self.ignore_dimensions += list(features) 40 | 41 | def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None: 42 | expansions = [] 43 | for limit in cubes: 44 | count = limit.cube.count(dataframe) 45 | dataframe = pd.concat([dataframe, limit.cube.create_samples(self.min_examples - count)]) 46 | limit.cube.update(dataframe, self.predictor) 47 | expansions.append(Expansion( 48 | limit.cube, limit.feature, limit.direction, 49 | abs(cube.output - limit.cube.output) if self._output is Target.CONSTANT else 50 | 1 - int(cube.output == limit.cube.output) 51 | )) 52 | if len(expansions) > 0: 53 | return sorted(expansions, key=lambda e: e.distance)[0] 54 | return None 55 | 56 | def _calculate_min_updates(self) -> Iterable[MinUpdate]: 57 | return [MinUpdate(name, (interval[1] - interval[0]) * self.min_update) for (name, interval) in 58 | self._surrounding.dimensions.items()] 59 | 60 | def _create_range(self, cube: GenericCube, min_updates: Iterable[MinUpdate], feature: str, direction: str)\ 61 | -> tuple[GenericCube, tuple[float, float]]: 62 | a, b = cube[feature] 63 | size = [min_update for min_update in min_updates if min_update.name == feature][0].value 64 | return (cube.copy(), (max(a - size, 
self._surrounding.get_first(feature)), a) 65 | if direction == '-' else (b, min(b + size, self._surrounding.get_second(feature)))) 66 | 67 | def _create_temp_cube(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 68 | hypercubes: Iterable[GenericCube], feature: str, 69 | direction: str) -> Iterable[Expansion]: 70 | temp_cube, values = self._create_range(cube, min_updates, feature, direction) 71 | temp_cube.update_dimension(feature, values) 72 | overlap = temp_cube.overlap(hypercubes) 73 | while (overlap is not None) & (temp_cube.has_volume()): 74 | overlap = ITER._resolve_overlap(temp_cube, overlap, hypercubes, feature, direction) 75 | if (temp_cube.has_volume() & (overlap is None)) & (all(temp_cube != cube for cube in hypercubes)): 76 | yield Expansion(temp_cube, feature, direction) 77 | else: 78 | cube.add_limit(feature, direction) 79 | 80 | def _create_temp_cubes(self, cube: GenericCube, min_updates: Iterable[MinUpdate], 81 | hypercubes: Iterable[GenericCube]) -> Iterable[Expansion]: 82 | tmp_cubes = [] 83 | for feature in self._surrounding.dimensions.keys(): 84 | if feature in self.ignore_dimensions: 85 | continue 86 | limit = cube.check_limits(feature) 87 | if limit == '*': 88 | continue 89 | for x in {'-', '+'} - {limit}: 90 | tmp_cubes += self._create_temp_cube(cube, min_updates, hypercubes, feature, x) 91 | return tmp_cubes 92 | 93 | def _cubes_to_update(self, dataframe: pd.DataFrame, to_expand: Iterable[GenericCube], 94 | hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate]) \ 95 | -> Iterable[tuple[GenericCube, Expansion]]: 96 | results = [(hypercube, self._best_cube(dataframe, hypercube, self._create_temp_cubes( 97 | hypercube, min_updates, hypercubes))) for hypercube in to_expand] 98 | return sorted([result for result in results if result[1] is not None], key=lambda x: x[1].distance) 99 | 100 | def _expand_or_create(self, cube: GenericCube, expansion: Expansion, hypercubes: Iterable[GenericCube]) -> None: 101 | if expansion.distance > self.threshold: 102 | hypercubes += [expansion.cube] 103 | else: 104 | cube.expand(expansion, hypercubes) 105 | 106 | @staticmethod 107 | def _find_closer_sample(dataframe: pd.DataFrame, output: float | str) -> dict[str, float]: 108 | if isinstance(output, str): 109 | close_sample = dataframe[dataframe.iloc[:, -1] == output].iloc[0].to_dict() 110 | else: 111 | difference = abs(dataframe.iloc[:, -1] - output) 112 | close_sample = dataframe[difference == min(difference)].iloc[0].to_dict() 113 | return close_sample 114 | 115 | def _generate_starting_points(self, dataframe: pd.DataFrame) -> Iterable[GenericCube]: 116 | if self.n_points <= 0: 117 | raise (Exception('InvalidAttributeValueException')) 118 | points: Iterable[float] 119 | if isinstance(dataframe.iloc[0, -1], str): 120 | classes = np.unique(dataframe.iloc[:, -1].values) 121 | points = [classes[i] for i in range(min(self.n_points, len(classes)))] 122 | else: 123 | desc = dataframe.iloc[:, -1].describe() 124 | min_output, max_output = desc["min"], desc["max"] 125 | points = [(max_output - min_output) / 2] if self.n_points == 1 else \ 126 | [min_output + (max_output - min_output) / (self.n_points - 1) * index for index in range(self.n_points)] 127 | return [HyperCube.cube_from_point(ITER._find_closer_sample(dataframe, point), output=self._output) 128 | for point in points] 129 | 130 | def _initialize(self, dataframe: pd.DataFrame) -> Iterable[MinUpdate]: 131 | self._fake_dataframe = dataframe.copy() 132 | self._surrounding = HyperCube.create_surrounding_cube(dataframe, 
output=self._output) 133 | min_updates = self._calculate_min_updates() 134 | self._init_hypercubes(dataframe, min_updates) 135 | for hypercube in self._hypercubes: 136 | hypercube.update(dataframe, self.predictor) 137 | return min_updates 138 | 139 | def _init_hypercubes(self, dataframe: pd.DataFrame, min_updates: Iterable[MinUpdate]): 140 | while True: 141 | hypercubes = self._generate_starting_points(dataframe) 142 | for hypercube in hypercubes: 143 | hypercube.expand_all(min_updates, self._surrounding) 144 | for d in self.ignore_dimensions: 145 | hypercube[d] = self._surrounding[d] 146 | self.n_points = self.n_points - 1 147 | if not HyperCube.check_overlap(hypercubes, hypercubes): 148 | break 149 | self._hypercubes = hypercubes 150 | 151 | def _iterate(self, dataframe: pd.DataFrame, hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate], 152 | left_iteration: int) -> int: 153 | np.random.seed(self.seed) 154 | iterations = 0 155 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 156 | while (len(to_expand) > 0) and (iterations < left_iteration): 157 | updates = list(self._cubes_to_update(dataframe, to_expand, hypercubes, min_updates)) 158 | if len(updates) > 0: 159 | self._expand_or_create(updates[0][0], updates[0][1], hypercubes) 160 | iterations += 1 161 | to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2] 162 | return iterations 163 | 164 | @staticmethod 165 | def _resolve_overlap(cube: GenericCube, overlapping_cube: GenericCube, hypercubes: Iterable[GenericCube], 166 | feature: str, direction: str) -> GenericCube: 167 | a, b = cube[feature] 168 | cube.update_dimension(feature, max(overlapping_cube.get_second(feature), a) if direction == '-' else a, 169 | min(overlapping_cube.get_first(feature), b) if direction == '+' else b) 170 | return cube.overlap(hypercubes) 171 | 172 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 173 | min_updates = self._initialize(dataframe) 174 | temp_train = dataframe.copy() 175 | fake = dataframe.copy() 176 | iterations = 0 177 | while temp_train.shape[0] > 0: 178 | iterations += self._iterate(fake, self._hypercubes, min_updates, self.max_iterations - iterations) 179 | if (iterations >= self.max_iterations) or (not self.fill_gaps): 180 | break 181 | temp_train = temp_train.iloc[[p is None for p in self.predict(temp_train.iloc[:, :-1])]] 182 | if temp_train.shape[0] > 0: 183 | point, ratio, overlap, new_cube = temp_train.iloc[0].to_dict(), 1.0, True, None 184 | temp_train = temp_train.drop([temp_train.index[0]]) 185 | while overlap is not None: 186 | if new_cube is not None: 187 | if not new_cube.has_volume(): 188 | break 189 | new_cube = HyperCube.cube_from_point(point, self._output) 190 | new_cube.expand_all(min_updates, self._surrounding, ratio) 191 | overlap = new_cube.overlap(self._hypercubes) 192 | ratio *= 2 193 | if new_cube.has_volume(): 194 | self._hypercubes += [new_cube] 195 | return self._create_theory(dataframe) 196 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/strategy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import reduce 4 | from collections import Iterable 5 | 6 | 7 | class Strategy: 8 | def __init__(self, partitions = None): 9 | self._partitions = partitions 10 | self._no_features = [] 11 | 12 | def get(self, feature: str) -> int: 13 | raise 
NotImplementedError 14 | 15 | def make_fair(self, features: Iterable[str]): 16 | self._no_features = features 17 | 18 | def partition_number(self, features: Iterable[str]) -> int: 19 | return reduce(lambda x, y: x * y, map(self.get, features), 1) 20 | 21 | def equals(self, strategy, features: Iterable[str]) -> bool: 22 | eq = True 23 | for f in features: 24 | eq = eq and self.get(f) == strategy.get(f) 25 | return eq 26 | 27 | def __str__(self): 28 | return self._partitions 29 | 30 | def __repr__(self): 31 | return self.__str__() 32 | 33 | 34 | class FixedStrategy(Strategy): 35 | def __init__(self, partitions: int = 2): 36 | super().__init__(partitions) 37 | 38 | def get(self, feature: str) -> int: 39 | return 1 if feature in self._no_features else self._partitions 40 | 41 | def __str__(self): 42 | return "Fixed ({})".format(super().__str__()) 43 | 44 | 45 | class AdaptiveStrategy(Strategy): 46 | def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None): 47 | super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)]) 48 | self.features = features 49 | 50 | def get(self, feature: str) -> int: 51 | if feature in self._no_features: 52 | return 1 53 | importance = next(filter(lambda t: t[0] == feature, self.features))[1] 54 | n = 1 55 | for (imp, part) in self._partitions: 56 | if importance >= imp: 57 | n = part 58 | else: 59 | break 60 | return n 61 | 62 | def __str__(self): 63 | return "Adaptive ({})".format(super().__str__()) 64 | -------------------------------------------------------------------------------- /psyke/extraction/hypercubic/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import math 3 | import warnings 4 | 5 | warnings.simplefilter("ignore") 6 | 7 | Dimension = tuple[float, float] 8 | Dimensions = dict[str, Dimension] 9 | 10 | 11 | class Expansion: 12 | 13 | def __init__(self, cube, feature: str, direction: str, distance: float = math.nan): 14 | self.cube = cube 15 | self.feature = feature 16 | self.direction = direction 17 | self.distance = distance 18 | 19 | def __getitem__(self, index: int) -> float: 20 | return self.cube[self.feature][index] 21 | 22 | def boundaries(self, a: float, b: float) -> (float, float): 23 | return (self[0], b) if self.direction == '-' else (a, self[1]) 24 | 25 | 26 | class Limit: 27 | 28 | def __init__(self, feature: str, direction: str): 29 | self.feature = feature 30 | self.direction = direction 31 | 32 | def __eq__(self, other): 33 | return (self.feature == other.feature) and (self.direction == other.direction) 34 | 35 | def __hash__(self): 36 | return hash(self.feature + self.direction) 37 | 38 | 39 | class MinUpdate: 40 | 41 | def __init__(self, name: str, value: float): 42 | self.name = name 43 | self.value = value 44 | 45 | 46 | class ZippedDimension: 47 | 48 | def __init__(self, name: str, this_dimension: Dimension, other_dimension: Dimension): 49 | self.name = name 50 | self.this_dimension = this_dimension 51 | self.other_dimension = other_dimension 52 | 53 | def __eq__(self, other: ZippedDimension) -> bool: 54 | return (self.name == other.name) and (self.this_dimension == other.this_dimension) and \ 55 | (self.other_dimension == other.other_dimension) 56 | 57 | -------------------------------------------------------------------------------- /psyke/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | from functools 
import lru_cache 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.real.utils import Rule, IndexedRuleSet 4 | from psyke.schema import DiscreteFeature 5 | from psyke.utils.dataframe import HashableDataFrame 6 | from psyke.utils.logic import create_term, create_head, create_variable_list 7 | from tuprolog.core import Var, Struct, Clause, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable 10 | import pandas as pd 11 | import numpy as np 12 | 13 | 14 | class REAL(PedagogicalExtractor): 15 | """ 16 | Explanator implementing Rule Extraction As Learning (REAL) algorithm, doi:10.1016/B978-1-55860-335-6.50013-1. 17 | The algorithm is sensible to features' order in the provided dataset during extraction. 18 | """ 19 | 20 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature]): 21 | super().__init__(predictor, discretization) 22 | self._ignore_feature = [] 23 | self._ruleset: IndexedRuleSet = IndexedRuleSet() 24 | 25 | @property 26 | def n_rules(self): 27 | return len(self._ruleset.flatten()) 28 | 29 | def _covers(self, sample: pd.Series, rules: list[Rule]) -> bool: 30 | new_rule = self._rule_from_example(sample) 31 | return any([new_rule in rule for rule in rules]) 32 | 33 | def _body(self, variables: dict[str, Var], rule: Rule) -> list[Struct]: 34 | result = [] 35 | for predicates, truth_value in zip(rule.to_lists(), [True, False]): 36 | for predicate in predicates: 37 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 38 | result.append(create_term(variables[feature.name], feature.admissible_values[predicate], truth_value)) 39 | return result 40 | 41 | def _create_clause(self, dataset: pd.DataFrame, variables: dict[str, Var], key: int, rule: Rule) -> Clause: 42 | return clause(create_head(dataset.columns[-1], list(variables.values()), key), self._body(variables, rule)) 43 | 44 | def _create_new_rule(self, sample: pd.Series) -> Rule: 45 | rule = self._rule_from_example(sample) 46 | return self._generalise(rule, sample) 47 | 48 | def _create_ruleset(self, dataset: pd.DataFrame) -> IndexedRuleSet: 49 | ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(dataset.iloc[:, -1]))) 50 | for _, sample in dataset.iloc[:, :-1].iterrows(): 51 | prediction = list(self.predictor.predict(sample.to_frame().transpose()))[0] 52 | rules = ruleset.get(prediction) 53 | if not self._covers(sample, rules): 54 | rules.append(self._create_new_rule(sample)) 55 | return ruleset.optimize() 56 | 57 | def _create_theory(self, dataset: pd.DataFrame) -> MutableTheory: 58 | theory = mutable_theory() 59 | for key, rule in self._ruleset.flatten(): 60 | variables = create_variable_list(self.discretization) 61 | theory.assertZ(self._create_clause(dataset, variables, key, rule)) 62 | return theory 63 | 64 | def _generalise(self, rule: Rule, sample: pd.Series) -> Rule: 65 | mutable_rule = rule.to_lists() 66 | samples = sample.to_frame().transpose() 67 | for predicate in rule.true_predicates: 68 | samples = self._remove_antecedent(samples.copy(), predicate, mutable_rule) 69 | return Rule(mutable_rule[0], mutable_rule[1]).reduce(self.discretization) 70 | 71 | def _remove_antecedent(self, samples: pd.DataFrame, predicate: str, rule: list[list[str]]) -> (pd.DataFrame, bool): 72 | feature = [feature for feature in self.discretization if predicate in feature.admissible_values][0] 73 | output = np.array(self.predictor.predict(samples)) 74 | copies = [samples.copy()] 75 | 
samples[predicate] = 0 76 | for f in [f for f in feature.admissible_values if f != predicate]: 77 | copy = samples.copy() 78 | copy[f] = 1 79 | if all(output == np.array(self.predictor.predict(copy))): 80 | copies.append(copy) 81 | rule[1].remove(f) 82 | if len(copies) > 1: 83 | rule[0].remove(predicate) 84 | return pd.concat([df for df in copies], ignore_index=True) 85 | 86 | @lru_cache(maxsize=512) 87 | def _get_or_set(self, dataset: HashableDataFrame) -> IndexedRuleSet: 88 | return self._create_ruleset(dataset) 89 | 90 | def _internal_predict(self, sample: pd.Series): 91 | x = [index for index, rule in self._ruleset.flatten() if self._rule_from_example(sample) in rule] 92 | return x[0] if x else None 93 | 94 | def make_fair(self, features: Iterable[str]): 95 | self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 96 | if self.discretization else [features] 97 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 98 | self._get_or_set.cache_clear() 99 | 100 | def _rule_from_example(self, sample: pd.Series) -> Rule: 101 | true_predicates, false_predicates = [], [] 102 | for feature, value in sample.items(): 103 | if feature in self._ignore_feature: 104 | continue 105 | true_predicates.append(str(feature)) if value == 1 else false_predicates.append(str(feature)) 106 | return Rule(true_predicates, false_predicates) 107 | 108 | def _subset(self, samples: pd.DataFrame, predicate: str) -> (pd.DataFrame, bool): 109 | samples_0 = samples.copy() 110 | samples_0[predicate].values[:] = 0 111 | samples_1 = samples.copy() 112 | samples_1[predicate].values[:] = 1 113 | samples_all = samples_0.append(samples_1) 114 | return samples_all, len(set(self.predictor.predict(samples_all))) == 1 115 | 116 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 117 | self._ruleset = self._get_or_set(HashableDataFrame(dataframe)) 118 | return self._create_theory(dataframe) 119 | 120 | def _predict(self, dataframe) -> Iterable: 121 | return np.array([self._internal_predict(data.transpose()) for _, data in dataframe.iterrows()]) 122 | -------------------------------------------------------------------------------- /psyke/extraction/real/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from psyke import DiscreteFeature 3 | from typing import Iterable 4 | import pandas as pd 5 | 6 | 7 | class Rule: 8 | 9 | def __init__(self, true_predicates: list[str], false_predicates: list[str]): 10 | self.true_predicates = true_predicates 11 | self.false_predicates = false_predicates 12 | 13 | def __contains__(self, other: Rule) -> bool: 14 | return all([predicate in other.true_predicates for predicate in self.true_predicates]) and\ 15 | all([predicate in other.false_predicates for predicate in self.false_predicates]) 16 | 17 | def __eq__(self, other: Rule) -> bool: 18 | return self.true_predicates == other.true_predicates and self.false_predicates == other.false_predicates 19 | 20 | def __hash__(self) -> int: 21 | return hash(self.true_predicates) + hash(self.false_predicates) 22 | 23 | def reduce(self, features: Iterable[DiscreteFeature]) -> Rule: 24 | to_be_removed = [item for tp in self.true_predicates 25 | for feature in features if tp in feature.admissible_values 26 | for item in feature.admissible_values.keys()] 27 | return Rule(self.true_predicates, [fp for fp in self.false_predicates if fp not in to_be_removed]) 28 | 29 | def to_lists(self) 
-> list[list[str]]: 30 | return [self.true_predicates.copy(), self.false_predicates.copy()] 31 | 32 | 33 | class IndexedRuleSet(dict[int, list[Rule]]): 34 | 35 | def flatten(self) -> list[tuple[int, Rule]]: 36 | return [(key, value) for key, values in self.items() for value in values] 37 | 38 | def optimize(self) -> IndexedRuleSet: 39 | useless_rules = [item for key, entry in self.items() for item in IndexedRuleSet._useless_rules(key, entry)] 40 | for rule in useless_rules: 41 | self[rule[0]].remove(rule[1]) 42 | return self 43 | 44 | @staticmethod 45 | def _useless_rules(key, rules: list[Rule]) -> list[(int, Rule)]: 46 | return [ 47 | (key, rule) for rule in rules 48 | if any(rule in other_rule for other_rule in rules if other_rule != rule) 49 | ] 50 | 51 | @staticmethod 52 | def create_indexed_ruleset(indices: Iterable) -> IndexedRuleSet: 53 | return IndexedRuleSet({i: [] for i in indices}) 54 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from psyke.extraction import PedagogicalExtractor 3 | from psyke.extraction.trepan.utils import Node, Split, SplitLogic 4 | from psyke import DiscreteFeature 5 | from psyke.utils.logic import create_term, create_variable_list, create_head 6 | from psyke.utils.sorted import SortedList 7 | from tuprolog.core import Var, Struct, clause 8 | from tuprolog.theory import MutableTheory, mutable_theory, Theory 9 | from typing import Iterable, Union, Any 10 | import pandas as pd 11 | 12 | 13 | class Trepan(PedagogicalExtractor): 14 | 15 | def __init__(self, predictor, discretization: Iterable[DiscreteFeature], min_examples: int = 0, max_depth: int = 3, 16 | split_logic: SplitLogic = SplitLogic.DEFAULT): 17 | super().__init__(predictor, discretization) 18 | self._ignore_feature = [] 19 | self.min_examples = min_examples 20 | self.max_depth = max_depth 21 | self.split_logic = split_logic 22 | self._root: Node 23 | 24 | def make_fair(self, features: Iterable[str]): 25 | self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \ 26 | if self.discretization else [features] 27 | self._ignore_feature = [feature for features in self._ignore_feature for feature in features] 28 | 29 | @property 30 | def n_rules(self): 31 | return sum(1 for _ in self._root) 32 | 33 | def _best_split(self, node: Node, names: Iterable[str]) -> Union[tuple[Node, Node], None]: 34 | if node.samples.shape[0] < self.min_examples: 35 | raise NotImplementedError() 36 | if node.n_classes == 1: 37 | return None 38 | splits = self._create_splits(node, names) 39 | return None if len(splits) == 0 or splits[0].children[0].depth > self.max_depth else splits[0].children 40 | 41 | def _compact(self): 42 | nodes = [self._root] 43 | while len(nodes) > 0: 44 | node = nodes.pop() 45 | for item in self._nodes_to_remove(node, nodes): 46 | node.children.remove(item) 47 | node.children += item.children 48 | 49 | def _create_body(self, variables: dict[str, Var], node: Node) -> Iterable[Struct]: 50 | result = [] 51 | for constraint, value in node.constraints: 52 | feature: DiscreteFeature = [d for d in self.discretization if constraint in d.admissible_values][0] 53 | result.append(create_term(variables[feature.name], feature.admissible_values[constraint], value == 1.0)) 54 | return result 55 | 56 | @staticmethod 57 | def _create_samples(node: Node, column: str, value: float) -> 
pd.DataFrame: 58 | return node.samples.loc[node.samples[column] == value] 59 | 60 | @staticmethod 61 | def _create_split(node: Node, column: str) -> Union[Split, None]: 62 | true_examples = Trepan._create_samples(node, column, 1.0) 63 | false_examples = Trepan._create_samples(node, column, 0.0) 64 | true_constraints = list(node.constraints) + [(column, 1.0)] 65 | false_constraints = list(node.constraints) + [(column, 0.0)] 66 | true_node = Node(true_examples, node.n_examples, true_constraints, depth=node.depth + 1) \ 67 | if true_examples.shape[0] > 0 else None 68 | false_node = Node(false_examples, node.n_examples, false_constraints, depth=node.depth + 1) \ 69 | if false_examples.shape[0] > 0 else None 70 | return None if true_node is None or false_node is None else Split(node, (true_node, false_node)) 71 | 72 | def _create_splits(self, node: Node, names: Iterable[str]) -> SortedList[Split]: 73 | splits, constraints = Trepan._init_splits(node) 74 | for column in [column for column in names if column not in list(constraints) + self._ignore_feature]: 75 | split = Trepan._create_split(node, column) 76 | if split is not None: 77 | splits.add(split) 78 | return splits 79 | 80 | def _create_theory(self, name: str) -> MutableTheory: 81 | theory = mutable_theory() 82 | for node in self._root: 83 | variables = create_variable_list(self.discretization) 84 | theory.assertZ( 85 | clause( 86 | create_head(name, list(variables.values()), str(node.dominant)), 87 | self._create_body(variables, node) 88 | ) 89 | ) 90 | return theory 91 | 92 | def _init(self, dataset: pd.DataFrame) -> SortedList[Node]: 93 | self._root = Node(dataset, dataset.shape[0]) 94 | queue: SortedList[Node] = SortedList(lambda x, y: int(x.priority - y.priority)) 95 | queue.add(self._root) 96 | return queue 97 | 98 | @staticmethod 99 | def _init_splits(node: Node) -> tuple[SortedList[Split], Iterable[str]]: 100 | return SortedList(lambda x, y: int(x.priority - y.priority)),\ 101 | set(constraint[0] for constraint in node.constraints) 102 | 103 | @staticmethod 104 | def _nodes_to_remove(node: Node, nodes: list[Node]) -> list[Node]: 105 | to_remove = [] 106 | for child in node.children: 107 | if node.dominant == child.dominant and len(child.children) == 1: 108 | to_remove.append(child) 109 | nodes.append(node) 110 | else: 111 | nodes.append(child) 112 | return to_remove 113 | 114 | @staticmethod 115 | def _internal_predict(x: pd.Series, node: Node, categories: Iterable) -> Any: 116 | for child in node.children: 117 | skip = False 118 | for constraint, value in child.constraints: 119 | if x[constraint] != value: 120 | skip = True 121 | break 122 | if not skip: 123 | return Trepan._internal_predict(x, child, categories) 124 | return node.dominant 125 | 126 | def _optimize(self) -> None: 127 | n, nodes = 0, [self._root] 128 | while len(nodes) > 0: 129 | n += Trepan._remove_nodes(nodes) 130 | self._compact() if n == 0 else self._optimize() 131 | 132 | @staticmethod 133 | def _remove_nodes(nodes: list[Node]) -> int: 134 | node = nodes.pop() 135 | to_remove = [child for child in node.children if len(child.children) == 0 and node.dominant == child.dominant] 136 | for child in to_remove: 137 | node.children.remove(child) 138 | for child in node.children: 139 | if len(child.children) > 0: 140 | nodes.append(child) 141 | return len(to_remove) 142 | 143 | def _extract(self, dataframe: pd.DataFrame) -> Theory: 144 | queue = self._init(dataframe) 145 | while len(queue) > 0: 146 | node = queue.pop() 147 | if self.split_logic == SplitLogic.DEFAULT:
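# DEFAULT is the only split logic implemented: every feature column (all but the last, which presumably holds the target) is ranked and the best pair of child nodes, if any, expands the current node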
148 | best: Union[tuple[Node, Node], None] = self._best_split(node, dataframe.columns[:-1]) 149 | if best is None: 150 | continue 151 | else: 152 | raise Exception('Illegal split logic') 153 | queue.add_all(best) 154 | node.children += list(best) 155 | self._optimize() 156 | return self._create_theory(dataframe.columns[-1]) 157 | 158 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 159 | return np.array( 160 | [Trepan._internal_predict(sample, self._root, dataframe.columns[-1]) for _, sample in dataframe.iterrows()] 161 | ) 162 | -------------------------------------------------------------------------------- /psyke/extraction/trepan/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from itertools import chain 3 | from typing import Iterable, Any 4 | import pandas as pd 5 | 6 | 7 | class Node: 8 | 9 | def __init__(self, samples: pd.DataFrame, n_examples: int, constraints: Iterable[tuple[str, float]] = None, 10 | children: list[Node] = None, depth: int = 0): 11 | self.samples = samples 12 | self.n_examples = n_examples 13 | self.constraints = [] if constraints is None else constraints 14 | self.children = [] if children is None else children 15 | self.depth = depth 16 | 17 | def __str__(self): 18 | name = ''.join(('' if c[1] > 0 else '!') + c[0] + ', ' for c in self.constraints) 19 | return name[:-2] + ' = ' + str(self.dominant) 20 | 21 | @property 22 | def priority(self) -> float: 23 | return -(self.reach * (1 - self.fidelity)) 24 | 25 | @property 26 | def fidelity(self) -> float: 27 | return 1.0 * self.correct / (self.samples.shape[0] if self.samples.shape[0] > 0 else 1) 28 | 29 | @property 30 | def reach(self) -> float: 31 | return 1.0 * self.samples.shape[0] / self.n_examples 32 | 33 | @property 34 | def correct(self) -> float: 35 | return sum(self.samples.iloc[:, -1] == self.dominant) 36 | 37 | @property 38 | def dominant(self) -> Any: 39 | return self.samples.iloc[:, -1].mode()[0] if self.samples.shape[0] > 0 else '' 40 | 41 | @property 42 | def n_classes(self) -> int: 43 | return len(set(self.samples.iloc[:, -1])) 44 | 45 | def __iter__(self) -> Iterable[Node]: 46 | for child in chain(*map(iter, self.children)): 47 | yield child 48 | yield self 49 | 50 | 51 | class Split: 52 | 53 | # TODO: should be configurable by user 54 | PRIORITY_BONUS: int = 100 55 | PRIORITY_PENALTY: int = 200 56 | 57 | def __init__(self, parent: Node, children: tuple[Node, Node]): 58 | self.parent = parent 59 | self.children = children 60 | 61 | @property 62 | def priority(self) -> float: 63 | return self.__priority(self.parent) 64 | 65 | def __priority(self, parent: Node) -> float: 66 | true_node, false_node = self.children 67 | priority = - (true_node.fidelity + false_node.fidelity) 68 | for node in [true_node, false_node]: 69 | priority -= self.PRIORITY_BONUS if parent.n_classes > node.n_classes else 0 70 | priority += self.PRIORITY_PENALTY if true_node.dominant == false_node.dominant else 0 71 | return priority 72 | 73 | 74 | class SplitLogic: 75 | 76 | DEFAULT = 1 77 | -------------------------------------------------------------------------------- /psyke/hypercubepredictor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.neighbors import BallTree 8 | 9 | from psyke import EvaluableModel, Target, get_int_precision 10 | from 
psyke.extraction.hypercubic import RegressionCube, GenericCube, Point 11 | 12 | 13 | class HyperCubePredictor(EvaluableModel): 14 | def __init__(self, output=Target.CONSTANT, discretization=None, normalization=None): 15 | super().__init__(discretization, normalization) 16 | self._hypercubes = [] 17 | self._dimensions_to_ignore = set() 18 | self._output = output 19 | self._surrounding = None 20 | 21 | def _predict(self, dataframe: pd.DataFrame) -> Iterable: 22 | return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()]) 23 | 24 | def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable: 25 | predictions = np.array(self._predict(dataframe)) 26 | idx = [prediction is None for prediction in predictions] 27 | if sum(idx) > 0: 28 | if criterion == 'default': 29 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output( 30 | self._surrounding, row 31 | ) for _, row in dataframe[idx].iterrows()]) 32 | elif criterion == 'surface': 33 | predictions[idx] = np.array([HyperCubePredictor._get_cube_output(self._brute_predict_surface(row), row) 34 | for _, row in dataframe[idx].iterrows()]) 35 | else: 36 | tree, cubes = self._create_brute_tree(criterion, n) 37 | predictions[idx] = np.array([HyperCubePredictor._brute_predict_from_cubes( 38 | row.to_dict(), tree, cubes 39 | ) for _, row in dataframe[idx].iterrows()]) 40 | return np.array(predictions) 41 | 42 | @staticmethod 43 | def _brute_predict_from_cubes(row: dict[str, float], tree: BallTree, 44 | cubes: list[GenericCube]) -> float | str: 45 | idx = tree.query([list(row.values())], k=1)[1][0][0] 46 | return HyperCubePredictor._get_cube_output(cubes[idx], row) 47 | 48 | def _brute_predict_surface(self, row: pd.Series) -> GenericCube: 49 | return min([( 50 | cube.surface_distance(Point(list(row.keys()), list(row.values))), cube.volume(), cube 51 | ) for cube in self._hypercubes])[-1] 52 | 53 | def _create_brute_tree(self, criterion: str = 'center', n: int = 2) -> (BallTree, list[GenericCube]): 54 | admissible_criteria = ['surface', 'center', 'corner', 'perimeter', 'density', 'default'] 55 | if criterion not in admissible_criteria: 56 | raise NotImplementedError( 57 | "'criterion' should be chosen in " + str(admissible_criteria) 58 | ) 59 | 60 | points = [(cube.center, cube) for cube in self._hypercubes] if criterion == 'center' else \ 61 | [(cube.barycenter, cube) for cube in self._hypercubes] if criterion == 'density' else \ 62 | [(corner, cube) for cube in self._hypercubes for corner in cube.corners()] if criterion == 'corner' else \ 63 | [(point, cube) for cube in self._hypercubes for point in cube.perimeter_samples(n)] \ 64 | if criterion == 'perimeter' else None 65 | 66 | return BallTree(pd.concat([point[0].to_dataframe() for point in points], ignore_index=True)), \ 67 | [point[1] for point in points] 68 | 69 | def _predict_from_cubes(self, data: dict[str, float]) -> float | str | None: 70 | cube = self._find_cube(data) 71 | if cube is None: 72 | return None 73 | elif self._output == Target.CLASSIFICATION: 74 | return HyperCubePredictor._get_cube_output(cube, data) 75 | else: 76 | return round(HyperCubePredictor._get_cube_output(cube, data), get_int_precision()) 77 | 78 | def _find_cube(self, data: dict[str, float]) -> GenericCube | None: 79 | if not self._hypercubes: 80 | return None 81 | data = data.copy() 82 | for dimension in self._dimensions_to_ignore: 83 | if dimension in data: 84 | del data[dimension] 85 | for cube in self._hypercubes: 86 | if data in cube: 
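# cubes are scanned in insertion order, so the first cube whose __contains__ check accepts the (filtered) sample wins; if none matches, a trailing default cube, when present, is used as fallback below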
87 | return cube.copy() 88 | if self._hypercubes[-1].is_default: 89 | return self._hypercubes[-1].copy() 90 | 91 | @property 92 | def n_rules(self): 93 | return len(list(self._hypercubes)) 94 | 95 | @property 96 | def volume(self): 97 | return sum([cube.volume() for cube in self._hypercubes]) 98 | 99 | @staticmethod 100 | def _get_cube_output(cube, data: dict[str, float]) -> float: 101 | return cube.output.predict(pd.DataFrame([data])).flatten()[0] if \ 102 | isinstance(cube, RegressionCube) else cube.output 103 | -------------------------------------------------------------------------------- /psyke/tuning/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from enum import Enum 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from psyke.extraction.hypercubic import Grid 7 | from psyke.utils import Target 8 | 9 | 10 | class Objective(Enum): 11 | MODEL = 1, 12 | DATA = 2 13 | 14 | 15 | class Optimizer: 16 | def __init__(self, dataframe: pd.DataFrame, output: Target = Target.CONSTANT, max_error_increase: float = 1.2, 17 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 18 | normalization=None, discretization=None): 19 | self.dataframe = dataframe 20 | self.output = output 21 | self.max_error_increase = max_error_increase 22 | self.min_rule_decrease = min_rule_decrease 23 | self.readability_tradeoff = readability_tradeoff 24 | self.patience = patience 25 | self.params = None 26 | self.normalization = normalization 27 | self.discretization = discretization 28 | 29 | def search(self): 30 | raise NotImplementedError 31 | 32 | def _best(self, params): 33 | param_dict = {self._score(t): t for t in params} 34 | min_param = min(param_dict) 35 | return min_param, param_dict[min_param] 36 | 37 | def _score(self, param): 38 | return param[0] * np.ceil(param[1] * self.readability_tradeoff) 39 | 40 | def _best_param(self, param): 41 | param_dict = {t[param]: t for t in self.params} 42 | min_param = min(param_dict) 43 | return min_param, param_dict[min_param] 44 | 45 | def get_best(self): 46 | names = ["Combined", "Predictive loss", "N rules"] 47 | params = [self._best(self.params), self._best_param(0), self._best_param(1)] 48 | for n, p in zip(names, params): 49 | self._print_params(n, p[1]) 50 | print() 51 | return self._best(self.params)[1], self._best_param(0)[1], self._best_param(1)[1] 52 | 53 | def _print_params(self, n, param): 54 | raise NotImplementedError 55 | 56 | 57 | class SKEOptimizer(Optimizer, ABC): 58 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 59 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5, 60 | objective: Objective = Objective.MODEL, output: Target = Target.CONSTANT, 61 | normalization=None, discretization=None): 62 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 63 | patience, normalization, discretization) 64 | self.predictor = predictor 65 | self.objective = objective 66 | 67 | 68 | class IterativeOptimizer(Optimizer, ABC): 69 | def __init__(self, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 70 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 71 | patience: int = 5, output: Target = Target.CONSTANT, normalization=None, discretization=None): 72 | super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff, 73 | patience, normalization, 
discretization) 74 | self.max_depth = max_depth 75 | 76 | def _iteration_improvement(self, best, other): 77 | if other[0] == best[0]: 78 | return (best[1] - other[1]) * 2 79 | return 1 / ( 80 | (1 - other[0] / best[0]) ** self.readability_tradeoff * 81 | np.ceil(other[1] / self.readability_tradeoff) / np.ceil(best[1] / self.readability_tradeoff) 82 | ) 83 | 84 | def _check_iteration_improvement(self, best, current): 85 | improvement = \ 86 | self._iteration_improvement([best[0], best[1]], [current[0], current[1]]) if best is not None else np.inf 87 | if isinstance(improvement, complex): 88 | improvement = 1.0 89 | return current, improvement < 1.2 90 | -------------------------------------------------------------------------------- /psyke/tuning/crash/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import pandas as pd 4 | 5 | from psyke.tuning import Objective, SKEOptimizer 6 | from psyke.tuning.orchid import OrCHiD 7 | from psyke.utils import Target 8 | 9 | 10 | class CRASH(SKEOptimizer): 11 | class Algorithm(Enum): 12 | ExACT = 1, 13 | CREAM = 2 14 | 15 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 16 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10, 17 | max_gauss_components: int = 5, patience: int = 5, output: Target = Target.CONSTANT, 18 | objective: Objective = Objective.MODEL, normalization=None, discretization=None): 19 | super().__init__(predictor, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 20 | patience, objective, output, normalization, discretization) 21 | self.max_depth = max_depth 22 | self.max_gauss_components = max_gauss_components 23 | 24 | def search(self): 25 | self.params = [] 26 | for algorithm in [OrCHiD.Algorithm.ExACT, OrCHiD.Algorithm.CREAM]: 27 | self.params += self.__search_algorithm(algorithm) 28 | 29 | def __search_algorithm(self, algorithm): 30 | params = [] 31 | best = None 32 | 33 | for gauss_components in range(2, self.max_gauss_components + 1): 34 | data = self.dataframe.sample(n=gauss_components * 100) if gauss_components * 100 < len(self.dataframe) \ 35 | else self.dataframe 36 | current_params = self.__search_components(data, algorithm, gauss_components) 37 | current_best = self._best(current_params)[1] 38 | if best is not None and self._score(best) <= self._score(current_best): 39 | break 40 | best = current_best 41 | params += current_params 42 | 43 | return params 44 | 45 | def __search_components(self, data, algorithm, gauss_components): 46 | orchid = OrCHiD(data, algorithm, self.output, self.max_error_increase, self.min_rule_decrease, 47 | self.readability_tradeoff, self.patience, self.max_depth, gauss_components, 48 | self.normalization, self.discretization) 49 | orchid.search() 50 | return [(*p, gauss_components, algorithm) for p in orchid.params] 51 | 52 | def _print_params(self, name, params): 53 | print("*****************************") 54 | print(f"Best {name}") 55 | print("*****************************") 56 | print(f"MAE = {params[0]:.2f}, {params[1]} rules") 57 | print(f"Algorithm = {params[5]}") 58 | print(f"Threshold = {params[3]:.2f}") 59 | print(f"Depth = {params[2]}") 60 | print(f"Gaussian components = {params[4]}") 61 | -------------------------------------------------------------------------------- /psyke/tuning/orchid/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 
import numpy as np 4 | import pandas as pd 5 | 6 | from psyke import Clustering, EvaluableModel 7 | from psyke.tuning import Optimizer, IterativeOptimizer 8 | from psyke.utils import Target 9 | 10 | 11 | class OrCHiD(IterativeOptimizer): 12 | class Algorithm(Enum): 13 | ExACT = 1, 14 | CREAM = 2 15 | 16 | def __init__(self, dataframe: pd.DataFrame, algorithm, output: Target = Target.CONSTANT, 17 | max_error_increase: float = 1.2, min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, 18 | patience: int = 5, max_depth: int = 10, gauss_components=10, normalization=None, discretization=None): 19 | super().__init__(dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, max_depth, patience, 20 | output, normalization, discretization) 21 | self.algorithm = algorithm 22 | self.gauss_components = gauss_components 23 | 24 | def search(self): 25 | self.params = self.__search_depth() 26 | 27 | def __search_depth(self): 28 | params, best = [], None 29 | 30 | for depth in range(1, self.max_depth + 1): 31 | current_params = self.__search_threshold(depth) 32 | current_best = self._best(current_params)[1] 33 | print() 34 | best, to_break = self._check_iteration_improvement(best, current_best) 35 | params += current_params 36 | 37 | if len(params) > 1 and to_break: 38 | break 39 | return params 40 | 41 | def __search_threshold(self, depth): 42 | step = 1.0 43 | threshold = 1.0 44 | params = [] 45 | patience = self.patience 46 | while patience > 0: 47 | print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. " 48 | f"Gaussian components = {self.gauss_components}. ", end="") 49 | clustering = (Clustering.cream if self.algorithm == OrCHiD.Algorithm.CREAM else Clustering.exact)( 50 | depth=depth, error_threshold=threshold, gauss_components=self.gauss_components, output=self.output 51 | ) 52 | clustering.fit(self.dataframe) 53 | task, metric = \ 54 | (EvaluableModel.Task.CLASSIFICATION, EvaluableModel.ClassificationScore.INVERSE_ACCURACY) \ 55 | if self.output == Target.CLASSIFICATION else \ 56 | (EvaluableModel.Task.REGRESSION, EvaluableModel.RegressionScore.MAE) 57 | p, n = clustering.score(self.dataframe, None, False, False, task=task, 58 | scoring_function=[metric])[metric][0], clustering.n_rules 59 | 60 | print(f"Predictive loss = {p:.2f}, {n} rules") 61 | 62 | if len(params) == 0: 63 | params.append((p, n, depth, threshold)) 64 | threshold = p / 20 65 | step = p / self.patience * 0.75 66 | continue 67 | 68 | if (n == 1) or (p == 0.0): 69 | params.append((p, n, depth, threshold)) 70 | break 71 | 72 | if p > params[0][0] * self.max_error_increase: 73 | break 74 | 75 | improvement = (params[-1][0] / p) + (1 - n / params[-1][1]) 76 | 77 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 78 | patience -= 1 79 | if p != params[-1][0] or n != params[-1][1]: 80 | params.append((p, n, depth, threshold)) 81 | threshold += step 82 | return params 83 | 84 | def _print_params(self, name, params): 85 | print("*" * 40) 86 | print(f"* Best {name}") 87 | print("*" * 40) 88 | print(f"* Predictive loss = {params[0]:.2f}, {params[1]} rules") 89 | print(f"* Threshold = {params[3]:.2f}") 90 | print(f"* Depth = {params[2]}") 91 | print("*" * 40) 92 | -------------------------------------------------------------------------------- /psyke/tuning/pedro/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from enum import Enum 4 | 5 | from sklearn.metrics import 
accuracy_score 6 | 7 | from psyke import Extractor, Target 8 | from psyke.extraction.hypercubic import Grid, FeatureRanker 9 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 10 | from psyke.tuning import Objective, IterativeOptimizer, SKEOptimizer 11 | 12 | 13 | class PEDRO(SKEOptimizer, IterativeOptimizer): 14 | class Algorithm(Enum): 15 | GRIDEX = 1, 16 | GRIDREX = 2, 17 | HEX = 3 18 | 19 | def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2, 20 | min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 3, 21 | patience: int = 3, algorithm: Algorithm = Algorithm.GRIDREX, objective: Objective = Objective.MODEL, 22 | output: Target = Target.CONSTANT, normalization=None, discretization=None): 23 | SKEOptimizer.__init__(self, predictor, dataframe, max_error_increase, min_rule_decrease, 24 | readability_tradeoff, patience, objective, output, normalization, discretization) 25 | IterativeOptimizer.__init__(self, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, 26 | max_depth, patience, output, normalization, discretization) 27 | self.algorithm = Extractor.gridrex if algorithm == PEDRO.Algorithm.GRIDREX else \ 28 | Extractor.gridex if algorithm == PEDRO.Algorithm.GRIDEX else Extractor.hex 29 | self.algorithm_name = "GridREx" if algorithm == PEDRO.Algorithm.GRIDREX else \ 30 | "GridEx" if algorithm == PEDRO.Algorithm.GRIDEX else "HEx" 31 | self.ranked = FeatureRanker(dataframe.columns[:-1]).fit(predictor, dataframe.iloc[:, :-1]).rankings() 32 | predictions = self.predictor.predict(dataframe.iloc[:, :-1]).flatten() 33 | expected = self.dataframe.iloc[:, -1].values 34 | self.error = 1 - accuracy_score(predictions, expected) if output == Target.CLASSIFICATION else \ 35 | abs(predictions - expected).mean() 36 | 37 | def _search_depth(self, strategy, critical, max_partitions): 38 | params, best = [], None 39 | 40 | for iterations in range(self.max_depth): 41 | current_params = self.__search_threshold(Grid(iterations + 1, strategy), critical, max_partitions) 42 | current_best = self._best(current_params)[1] 43 | print() 44 | best, to_break = self._check_iteration_improvement(best, current_best) 45 | params += current_params 46 | 47 | if len(params) > 1 and to_break: 48 | break 49 | return params 50 | 51 | def __search_threshold(self, grid, critical, max_partitions): 52 | step = self.error / 2.0 53 | threshold = self.error * 0.5 54 | params = [] 55 | patience = self.patience 56 | while patience > 0: 57 | print("{}. {}. Threshold = {:.2f}. 
".format(self.algorithm_name, grid, threshold), end="") 58 | param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization) 59 | if self.algorithm != Extractor.gridrex: 60 | param_dict['output'] = self.output 61 | extractor = self.algorithm(self.predictor, grid, **param_dict) 62 | _ = extractor.extract(self.dataframe) 63 | error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \ 64 | else extractor.mae 65 | error, n = (error_function(self.dataframe, self.predictor) if self.objective == Objective.MODEL else 66 | error_function(self.dataframe)), extractor.n_rules 67 | print("MAE = {:.2f}, {} rules".format(error, n)) 68 | 69 | if len(params) == 0: 70 | params.append((error, n, threshold, grid)) 71 | threshold += step 72 | continue 73 | 74 | if n > max_partitions: 75 | break 76 | 77 | if n == 1: 78 | params.append((error, n, threshold, grid)) 79 | break 80 | 81 | if error > params[0][0] * self.max_error_increase: 82 | break 83 | 84 | improvement = (params[-1][0] / error) + (1 - n / params[-1][1]) 85 | 86 | if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease): 87 | patience -= 1 88 | step = max(step, abs(error - threshold) / max(patience, 1)) 89 | elif not critical: 90 | patience = self.patience 91 | if error != params[-1][0] or n != params[-1][1]: 92 | params.append((error, n, threshold, grid)) 93 | threshold += step 94 | return params 95 | 96 | def __contains(self, strategies, strategy): 97 | for s in strategies: 98 | if strategy.equals(s, self.dataframe.columns[:-1]): 99 | return True 100 | return False 101 | 102 | def search(self): 103 | max_partitions = 200 104 | base_partitions = FixedStrategy(2).partition_number(self.dataframe.columns[:-1]) * 3 105 | if base_partitions <= max_partitions: 106 | strategies = [FixedStrategy(2)] 107 | if FixedStrategy(3).partition_number(self.dataframe.columns[:-1]) <= max_partitions: 108 | strategies.append(FixedStrategy(3)) 109 | else: 110 | strategies = [] 111 | base_partitions = max_partitions 112 | 113 | for n in [2, 3, 5, 10]: 114 | for th in [0.99, 0.75, 0.67, 0.5, 0.3]: 115 | strategy = AdaptiveStrategy(self.ranked, [(th, n)]) 116 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 117 | not self.__contains(strategies, strategy): 118 | strategies.append(strategy) 119 | 120 | for (a, b) in [(0.33, 0.67), (0.25, 0.75), (0.1, 0.9)]: 121 | strategy = AdaptiveStrategy(self.ranked, [(a, 2), (b, 3)]) 122 | if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \ 123 | not self.__contains(strategies, strategy): 124 | strategies.append(strategy) 125 | 126 | avg = 0. 
127 | for strategy in strategies: 128 | avg += strategy.partition_number(self.dataframe.columns[:-1]) 129 | avg /= len(strategies) 130 | 131 | params = [] 132 | for strategy in strategies: 133 | params += self._search_depth(strategy, 134 | strategy.partition_number(self.dataframe.columns[:-1]) > avg, 135 | base_partitions) 136 | self.params = params 137 | 138 | def _print_params(self, name, params): 139 | print("**********************") 140 | print(f"Best {name}") 141 | print("**********************") 142 | print(f"Error = {params[0]:.2f}, {params[1]} rules") 143 | print(f"Threshold = {params[2]:.2f}") 144 | print(f"Iterations = {params[3].iterations}") 145 | print(f"Strategy = {params[3].strategy}") 146 | -------------------------------------------------------------------------------- /psyke/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from math import log10 3 | from random import Random 4 | 5 | _DEFAULT_RANDOM_SEED: int = 123 6 | 7 | ONNX_EXTENSION: str = '.onnx' 8 | 9 | _random_options = dict(_deterministic_mode=True, _default_random_seed=_DEFAULT_RANDOM_SEED) 10 | 11 | _random_seed_generator: Random = Random(_DEFAULT_RANDOM_SEED) 12 | 13 | _DEFAULT_PRECISION: float = 1e-6 14 | 15 | _precision_options: dict = {'precision': _DEFAULT_PRECISION} 16 | 17 | 18 | class TypeNotAllowedException(Exception): 19 | 20 | def __init__(self, type_name: str): 21 | super().__init__('Type "' + type_name + '" not allowed for discretization.') 22 | 23 | 24 | class Range: 25 | def __init__(self, mean: float, std: float): 26 | self.mean = mean 27 | self.std = std 28 | self.lower = mean 29 | self.upper = mean 30 | 31 | def left_infinite(self): 32 | self.lower = float('-inf') 33 | 34 | def right_infinite(self): 35 | self.upper = float('inf') 36 | 37 | def expand_left(self): 38 | self.lower -= self.std 39 | 40 | def expand_right(self): 41 | self.upper += self.std 42 | 43 | 44 | def is_deterministic_mode(): 45 | return _random_options['_deterministic_mode'] 46 | 47 | 48 | def set_deterministic_mode(value: bool): 49 | _random_options['_deterministic_mode'] = value 50 | 51 | 52 | def get_default_random_seed(): 53 | if is_deterministic_mode(): 54 | return _random_options['_default_random_seed'] 55 | else: 56 | return _random_seed_generator.randint(0, 1 << 64) 57 | 58 | 59 | def set_default_random_seed(value: int): 60 | _random_options['_default_random_seed'] = value 61 | 62 | 63 | def get_default_precision() -> float: 64 | return _precision_options['precision'] 65 | 66 | 67 | def get_int_precision() -> int: 68 | return -1 * int(log10(get_default_precision())) 69 | 70 | 71 | def set_default_precision(value: float): 72 | _precision_options['precision'] = value 73 | 74 | 75 | class Target(Enum): 76 | CLASSIFICATION = 1, 77 | CONSTANT = 2, 78 | REGRESSION = 3 79 | -------------------------------------------------------------------------------- /psyke/utils/dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | from hashlib import sha256 5 | from typing import Iterable, List 6 | import pandas as pd 7 | from pandas.core.util.hashing import hash_pandas_object 8 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_integer_dtype 9 | from sklearn.preprocessing import StandardScaler 10 | from sympy.core.containers import OrderedSet 11 | 12 | from psyke import DiscreteFeature 13 | from psyke.schema import LessThan, 
GreaterThan, Between, Value, Constant 14 | from psyke.utils import TypeNotAllowedException, Range 15 | 16 | 17 | def split_features(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 18 | result = [] 19 | features = {'V' + str(index + 1): column for index, column in enumerate(dataframe.columns)} 20 | for feature, column in features.items(): 21 | values = set(dataframe[column]) 22 | result.append(DiscreteFeature(feature, {feature + '_' + str(i): v for i, v in enumerate(values)})) 23 | return result 24 | 25 | 26 | def get_discrete_features_supervised(dataframe: pd.DataFrame) -> Iterable[DiscreteFeature]: 27 | result = OrderedSet() 28 | for feature in dataframe.columns[:-1]: 29 | result.add(DiscreteFeature(feature, create_set(feature, dataframe))) 30 | return result 31 | 32 | 33 | def create_set(feature: str, dataframe: pd.DataFrame) -> dict[str, Value]: 34 | if is_string_dtype(dataframe[feature]) or is_integer_dtype(dataframe[feature]): 35 | values = dataframe[feature].unique() 36 | elif is_numeric_dtype(dataframe[feature]): 37 | values = create_ranges(feature, dataframe) 38 | else: 39 | raise TypeNotAllowedException(dataframe[feature].dtype) 40 | return {"{}_{}".format(feature, i): create_original_value(v) for (i, v) in enumerate(values)} 41 | 42 | 43 | def create_original_value(value: Range | str | int) -> Value: 44 | if isinstance(value, Range): 45 | if value.lower == float('-inf'): 46 | return LessThan(value.upper) 47 | if value.upper == float('inf'): 48 | return GreaterThan(value.lower) 49 | return Between(value.lower, value.upper) 50 | return Constant(value) 51 | 52 | 53 | def create_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 54 | ranges = init_ranges(feature, dataframe) 55 | expand_ranges(ranges) 56 | ranges[0].left_infinite() 57 | ranges[-1].right_infinite() 58 | return ranges 59 | 60 | 61 | def expand_ranges(ranges: Iterable[Range]): 62 | for r1, r2 in zip(ranges[0:-1], ranges[1:]): 63 | while r1.upper < r2.lower: 64 | r1.expand_right() 65 | r2.expand_left() 66 | mean = ((r1.upper - r1.std + r2.lower + r2.std) / 2) 67 | r1.upper = mean 68 | r2.lower = mean 69 | 70 | 71 | def init_ranges(feature: str, dataframe: pd.DataFrame) -> Iterable[Range]: 72 | desc = [dataframe[dataframe.iloc[:, -1] == v].describe()[feature] for v in dataframe.iloc[:, -1].unique()] 73 | desc = [(d['mean'], d['std']) for d in desc] 74 | desc.sort() 75 | return [Range(d[0], d[1]) for d in desc] 76 | 77 | 78 | def get_discrete_features_equal_frequency( 79 | dataframe: pd.DataFrame, 80 | bins: int = None, 81 | output: bool = True, 82 | bin_names: List[str] = [] 83 | ) -> Iterable[DiscreteFeature]: 84 | features = dataframe.columns[:-1] if output else dataframe.columns 85 | result = set() 86 | if bins is None: 87 | if len(bin_names) > 0: 88 | bins = len(bin_names) 89 | else: 90 | raise ValueError("No bins nor bin_names have been provided") 91 | elif bins > 0: 92 | if len(bin_names) == 0: 93 | bin_names = range(0, bins) 94 | elif len(bin_names) == bins: 95 | pass 96 | else: 97 | raise ValueError("Mismatch among the provided amount of bins and the bin_names") 98 | else: 99 | raise ValueError("Negative amount of bins makes no sense") 100 | for feature in features: 101 | values = sorted(dataframe[feature]) 102 | intervals = [values[i * math.ceil(len(values) / bins)] for i in range(1, bins)] 103 | starting_interval: list[Value] = [LessThan(intervals[0])] 104 | ending_interval: list[Value] = [GreaterThan(intervals[-1])] 105 | middle_intervals: list[Value] = [Between(intervals[i], intervals[i + 
1]) for i in range(0, len(intervals) - 1)] 106 | new_intervals = starting_interval + middle_intervals + ending_interval 107 | new_feature_names = [feature + '_' + str(i) for i in range(0, bins)] 108 | new_features = {new_feature_names[i]: new_intervals[i] for i in range(0, bins)} 109 | result.add(DiscreteFeature(feature, new_features)) 110 | return result 111 | 112 | 113 | def get_discrete_dataset(dataset: pd.DataFrame, discrete_features: Iterable[DiscreteFeature], 114 | sort: bool = True) -> pd.DataFrame: 115 | """ 116 | Create a new dataset mapping the old features into the new discrete features. 117 | Note: some algorithms require the new features to be SORTED in order to be replicable, due to rule optimization and other internal steps. 118 | Therefore, the new features are sorted alphabetically. 119 | This is not strictly necessary, because those algorithms internally perform the sorting themselves. 120 | However, it is good practice for this function to always return the same result for the same inputs. 121 | 122 | :param dataset: the original dataset 123 | :param discrete_features: mapping for the features 124 | :param sort: alphabetically sort new features 125 | :return: the new discrete dataset 126 | """ 127 | columns_name = [key for feature in discrete_features for key, _ in feature.admissible_values.items()] 128 | if sort: 129 | columns_name = sorted(columns_name) 130 | new_dataset = pd.DataFrame(columns=columns_name) 131 | for feature in discrete_features: 132 | for index, value in enumerate(dataset[feature.name]): 133 | for key, admissible_value in feature.admissible_values.items(): 134 | new_dataset.loc[index, key] = int(admissible_value.is_in(value)) 135 | 136 | for feature in discrete_features: 137 | for new_feature in feature.admissible_values.keys(): 138 | new_dataset[new_feature] = new_dataset[new_feature].astype(str).astype(int) 139 | 140 | return new_dataset 141 | 142 | 143 | def get_scaled_dataset(dataset: pd.DataFrame) -> tuple[pd.DataFrame, dict[str, tuple[float, float]]]: 144 | scaler = StandardScaler() 145 | scaler.fit(dataset) 146 | normalization = {key: (m, s) for key, m, s in zip(dataset.columns, scaler.mean_, scaler.scale_)} 147 | return pd.DataFrame(scaler.transform(dataset), columns=dataset.columns, index=dataset.index), normalization 148 | 149 | 150 | def scale_dataset(dataset: pd.DataFrame, normalization: dict[str, tuple[float, float]]) -> pd.DataFrame: 151 | new_data = pd.DataFrame() 152 | for column in dataset.columns: 153 | m, s = normalization[column] 154 | new_data[column] = (dataset[column] - m) / s 155 | return new_data 156 | 157 | 158 | class HashableDataFrame(pd.DataFrame): 159 | def __init__(self, obj): 160 | super().__init__(obj) 161 | 162 | def __hash__(self): 163 | hash_value = sha256(hash_pandas_object(self, index=True).values) 164 | hash_value = hash(hash_value.hexdigest()) 165 | return hash_value 166 | 167 | def __eq__(self, other): 168 | return self.equals(other) 169 | -------------------------------------------------------------------------------- /psyke/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score 4 | 5 | 6 | def mae(expected, predicted): 7 | """ 8 | Calculates the predictions' MAE w.r.t. the instances given as input. 9 | 10 | :param expected: the expected data. 11 | :param predicted: the predicted data. 12 | :return: the mean absolute error (MAE) of the predictions.
13 | """ 14 | return score(expected, predicted, mean_absolute_error) 15 | 16 | 17 | def mse(expected, predicted): 18 | """ 19 | Calculates the predictions' MSE w.r.t. the instances given as input. 20 | 21 | :param expected: the expected data. 22 | :param predicted: the predicted data. 23 | :return: the mean squared error (MSE) of the predictions. 24 | """ 25 | return score(expected, predicted, mean_squared_error) 26 | 27 | 28 | def r2(expected, predicted): 29 | """ 30 | Calculates the predictions' R2 w.r.t. the instances given as input. 31 | 32 | :param expected: the expected data. 33 | :param predicted: the predicted data. 34 | :return: the R2 score of the predictions. 35 | """ 36 | return score(expected, predicted, r2_score) 37 | 38 | 39 | def accuracy(expected, predicted): 40 | """ 41 | Calculates the predictions' classification accuracy w.r.t. the instances given as input. 42 | 43 | :param expected: the expected data. 44 | :param predicted: the predicted data. 45 | :return: the classification accuracy of the predictions. 46 | """ 47 | return score(expected, predicted, accuracy_score) 48 | 49 | 50 | def f1(expected, predicted): 51 | """ 52 | Calculates the predictions' F1 score w.r.t. the instances given as input. 53 | 54 | :param expected: the expected data. 55 | :param predicted: the predicted data. 56 | :return: the F1 score of the predictions. 57 | """ 58 | return score(expected, predicted, partial(f1_score, average='weighted')) 59 | 60 | 61 | def score(expected, predicted, scoring_function): 62 | """ 63 | Calculates the predictions' score w.r.t. the instances given as input with the provided scoring function. 64 | 65 | :param expected: the expected data. 66 | :param predicted: the predicted data. 67 | :param scoring_function: the scoring function to be used. 68 | :return: the score of the predictions.
69 | """ 70 | idx = [prediction is not None for prediction in predicted] 71 | return scoring_function(expected[idx], predicted[idx]) 72 | -------------------------------------------------------------------------------- /psyke/utils/plot.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Callable, Iterable 3 | import numpy as np 4 | import pandas as pd 5 | from matplotlib import colors 6 | import matplotlib.pyplot as plt 7 | from matplotlib.lines import Line2D 8 | from tuprolog.solve.prolog import prolog_solver 9 | from tuprolog.theory import Theory, mutable_theory 10 | 11 | from psyke.extraction.hypercubic import HyperCubeExtractor 12 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 13 | 14 | import matplotlib 15 | #matplotlib.use('TkAgg') 16 | 17 | 18 | def plot_init(xlim, ylim, xlabel, ylabel, size=(4, 3), equal=False): 19 | plt.figure(figsize=size) 20 | if equal: 21 | plt.gca().set_aspect(1) 22 | plt.xlim(xlim) 23 | plt.ylim(ylim) 24 | plt.gca().set_xlabel(xlabel) 25 | plt.gca().set_ylabel(ylabel) 26 | plt.gca().set_rasterized(True) 27 | 28 | 29 | def plot_point(x, y, color, marker, ec=None): 30 | plt.scatter(x, y, c=color, marker=marker, edgecolors=ec, linewidths=0.6) 31 | 32 | 33 | def plot_classification_samples(dataframe, classes, colors, markers, labels, loc, name, show=True, ec=None): 34 | marks = [Line2D([0], [0], color=c, marker=m, lw="0") for c, m in zip(colors, markers)] 35 | 36 | for cl, c, m in zip(classes, colors, markers): 37 | df = dataframe[dataframe.target == cl] 38 | plot_point(df["petal length"], df["petal width"], c, m, ec=ec) 39 | 40 | plt.gca().legend(marks, labels, loc=loc) 41 | plt.savefig("plot/{}.pdf".format(name), dpi=500, bbox_inches='tight') 42 | if show: 43 | plt.show() 44 | 45 | 46 | def plot_boundaries(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 47 | a: float = .5, h: str = '////////', ls='-', e=.05, fc='none', ec=None, reverse=False): 48 | cubes = extractor._hypercubes.copy() 49 | if reverse: 50 | cubes.reverse() 51 | for cube in cubes: 52 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 53 | fc=colors[cube.output] if fc is None else fc, 54 | ec=colors[cube.output] if ec is None else ec, alpha=a, hatch=h, linestyle=ls) 55 | 56 | 57 | def plot_surfaces(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], ec='r', e=.05): 58 | for cube in extractor._hypercubes: 59 | plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e, 60 | fc='none', ec=ec) 61 | 62 | 63 | def plot_perimeters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], n: int = 5, 64 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 65 | for cube in extractor._hypercubes: 66 | for corner in cube.perimeter_samples(n): 67 | plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 68 | 69 | 70 | def plot_centers(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 71 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 72 | for cube in extractor._hypercubes: 73 | center = cube.center 74 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 75 | 76 | 77 | def plot_corners(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 78 | ec: str = 'r', m: str = 
'*', s: int = 60, z: float = 1e10, lw: float = 0.8): 79 | for cube in extractor._hypercubes: 80 | for corner in cube.corners(): 81 | plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 82 | 83 | 84 | def plot_barycenters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], 85 | ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8): 86 | for cube in extractor._hypercubes: 87 | center = cube.barycenter 88 | plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw) 89 | 90 | 91 | def predict_from_theory(theory: Theory, data: pd.DataFrame) -> list[float or str]: 92 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 93 | index = data.shape[1] - 1 94 | y_element = data.iloc[0, -1] 95 | cast: Callable = lambda x: (str(x) if isinstance(y_element, str) else x) 96 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in data.iterrows()] 97 | return [cast(query.solved_query.get_arg_at(index)) if query.is_yes else -1 for query in substitutions] 98 | 99 | 100 | def plot_theory(theory: Theory, data: pd.DataFrame = None, output: str = 'plot.pdf', azimuth: float = 45, 101 | distance: float = 9, elevation: float = 5, show_theory: bool = True, features: Iterable[str] = None) -> None: 102 | # Check if the number of common variables in clauses is less or equal to three. 103 | # If not raise an exception. 104 | fresh_theory = mutable_theory(theory) 105 | clauses = fresh_theory.clauses 106 | variables = sorted(list(set(arg.args[0].name.split('_')[0] for clause in clauses if clause.body_size > 0 and clause.body.is_recursive for arg in clause.body.unfolded)), reverse=True) 107 | if len(variables) > 3: 108 | raise Exception("Theory contains too many different features in the body of clauses, maximum is 3.") 109 | # If data is None, then create synthetic data covering a good portion of the variables space. 110 | # Just skip for now. 
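# consequently, a dataframe is mandatory for now: calling plot_theory(theory, data=None) raises immediately below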
111 | if data is None: 112 | raise Exception("Method without data is not implemented yet") 113 | 114 | # Prepare data 115 | ys = predict_from_theory(fresh_theory, data) 116 | xs = data[variables].values.tolist() 117 | for i in range(len(ys)): 118 | xs[i].append(ys[i]) 119 | 120 | # Prepare colors 121 | if isinstance(ys[0], str): 122 | import warnings; warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)  # use the stdlib warnings module: the np.warnings alias is not available in recent NumPy releases 123 | 124 | class ColorGenerator: 125 | 126 | def __init__(self): 127 | self.color_list = ['red', 'royalblue', 'green', 'orange', 'pink', 'aqua', 'grey'] 128 | self.counter = 0 129 | 130 | def get_new_color(self) -> str: 131 | self.counter += 1 132 | if self.counter > len(self.color_list): 133 | raise Exception("Classes exceed the maximum supported number (7)") 134 | return self.color_list[self.counter - 1] 135 | 136 | classes = set(ys) 137 | generator = ColorGenerator() 138 | class_color = {c: generator.get_new_color() for c in classes} 139 | get_color: Callable = lambda c: class_color[c] 140 | else: 141 | def color_fader(v: float = 0., c1: str = 'green', c2: str = 'red'): 142 | c1 = np.array(colors.to_rgb(c1))  # element-wise blending needs NumPy arrays, not array.array 143 | c2 = np.array(colors.to_rgb(c2)) 144 | return colors.to_hex((1 - v) * c1 + v * c2) 145 | min_value = min(ys) 146 | max_value = max(ys) 147 | get_normalized_value: Callable = lambda v: (v - min_value)/(max_value - min_value) 148 | get_color: Callable = lambda c: color_fader(get_normalized_value(c)) 149 | 150 | fig = plt.figure() 151 | fig.set_size_inches(10, 10) 152 | if len(variables) == 3: 153 | ax = fig.add_subplot(projection='3d') 154 | else: 155 | ax = fig.add_subplot() 156 | 157 | for x in xs: 158 | ax.scatter(*x[:-1], c=get_color(x[-1]), s=14) 159 | 160 | ax.set_xlabel(variables[0], fontsize=18) 161 | ax.set_ylabel(variables[1], fontsize=18) 162 | if len(variables) == 3: 163 | ax.set_zlabel(variables[2], fontsize=18) 164 | 165 | ax.azim = azimuth 166 | ax.dist = distance 167 | ax.elev = elevation 168 | ax.set_title('Predictions according to Prolog theory', fontsize=24) 169 | if show_theory: 170 | pass 171 | # ax.text2D(0., 0.88, pretty_theory(theory, new_line=False), transform=ax.transAxes, fontsize=8) 172 | if isinstance(ys[0], str): 173 | custom_lines = [Line2D([0], [0], marker='o', markerfacecolor=get_color(c), 174 | markersize=20, color='w') for c in classes] 175 | ax.legend(custom_lines, classes, loc='upper left', numpoints=1, ncol=3, fontsize=18, bbox_to_anchor=(0, 0)) 176 | plt.savefig(output, format='pdf') 177 | -------------------------------------------------------------------------------- /psyke/utils/sorted.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Any 2 | 3 | 4 | class SortedList(list): 5 | 6 | def __init__(self, comparator: Callable[[Any, Any], int]): 7 | super().__init__() 8 | self.comparator = comparator 9 | 10 | def add(self, item) -> None: 11 | if len(self) == 0: 12 | self.insert(0, item) 13 | else: 14 | starting_len = len(self) 15 | for index, element in enumerate(self): 16 | if self.comparator(element, item) > 0: 17 | self.insert(index, item) 18 | break 19 | if len(self) == starting_len: 20 | self.append(item) 21 | 22 | def add_all(self, other) -> None: 23 | for item in other: 24 | self.add(item) 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = 
"setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | ":rebaseStalePrs", 5 | ":semanticCommits", 6 | "docker:disable" 7 | ], 8 | "assignees": [ 9 | "MatteoMagnini" 10 | ], 11 | "automerge": true, 12 | "dependencyDashboard": true, 13 | "git-submodules": { 14 | "enabled": true 15 | }, 16 | "includeForks": true, 17 | "packageRules": [ 18 | { 19 | "description": "Updates to GitHub Actions should be tagged as 'ci'", 20 | "matchPaths": [ 21 | ".github/workflows/*.yml", 22 | ".github/workflows/*.yaml" 23 | ], 24 | "semanticCommitType": "ci" 25 | }, 26 | { 27 | "matchPackageNames": ["net.sourceforge.plantuml:plantuml"], 28 | "allowedVersions": "/^1\\./" 29 | } 30 | ], 31 | "prConcurrentLimit": 25, 32 | "prHourlyLimit": 0, 33 | "separateMajorMinor": true, 34 | "separateMinorPatch": true, 35 | "separateMultipleMajor": true 36 | } 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | build==1.2.2.post1 2 | twine==6.1.0 3 | numpy==1.26.4 4 | pandas==2.3.0 5 | scikit-learn==1.6.1 6 | 2ppy==0.4.1 7 | skl2onnx==1.18.0 8 | onnxruntime==1.19.2 9 | tensorflow==2.16.2 10 | parameterized==0.9.0 11 | protobuf==4.25.8 12 | setuptools==80.9.0 13 | kneed==0.8.5 14 | sympy==1.14.0 15 | matplotlib==3.9.4 16 | joblib==1.5.1 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import pathlib 3 | import subprocess 4 | import distutils.cmd 5 | 6 | here = pathlib.Path(__file__).parent.resolve() 7 | 8 | version_file = here / 'VERSION' 9 | 10 | # Get the long description from the README file 11 | long_description = (here / 'README.md').read_text(encoding='utf-8') 12 | 13 | 14 | EPOCHS: int = 50 15 | BATCH_SIZE: int = 16 16 | REQUIREMENTS = [ 17 | 'numpy~=1.26.0', 18 | 'pandas~=2.3.0', 19 | 'scikit-learn~=1.6.0', 20 | '2ppy~=0.4.0', 21 | 'kneed~=0.8.1', 22 | 'sympy~=1.11' 23 | ] # Optional 24 | 25 | 26 | def format_git_describe_version(version): 27 | if '-' in version: 28 | splitted = version.split('-') 29 | tag = splitted[0] 30 | index = f"dev{splitted[1]}" 31 | return f"{tag}.{index}" 32 | else: 33 | return version 34 | 35 | 36 | def get_version_from_git(): 37 | try: 38 | process = subprocess.run(["git", "describe"], cwd=str(here), check=True, capture_output=True) 39 | version = process.stdout.decode('utf-8').strip() 40 | version = format_git_describe_version(version) 41 | with version_file.open('w') as f: 42 | f.write(version) 43 | return version 44 | except subprocess.CalledProcessError: 45 | if version_file.exists(): 46 | return version_file.read_text().strip() 47 | else: 48 | return '0.1.0.archeo' 49 | 50 | 51 | version = get_version_from_git() 52 | 53 | 54 | print(f"Detected version {version} from git describe") 55 | 56 | 57 | class GetVersionCommand(distutils.cmd.Command): 58 | """A custom command to get the current project version inferred from git describe.""" 59 | 60 | description = 'gets the project version from git describe' 61 | user_options = [] 62 | 63 | def initialize_options(self): 64 | pass 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | print(version) 71 | 72 | 73 | #class 
CreateTestPredictors(distutils.cmd.Command): 74 | # description = 'gets the project version from git describe' 75 | # user_options = [] 76 | 77 | # def initialize_options(self): 78 | # pass 79 | 80 | # def finalize_options(self): 81 | # pass 82 | 83 | # def run(self): 84 | # from psyke.utils import get_default_random_seed 85 | # from psyke.utils.dataframe import get_discrete_dataset 86 | # from sklearn.model_selection import train_test_split 87 | # from test import REQUIRED_PREDICTORS, get_dataset, get_model, get_schema 88 | # from test.resources.predictors import get_predictor_path, PATH, create_predictor_name 89 | # import ast 90 | # import pandas as pd 91 | # from tensorflow.keras import Model 92 | # from test import Predictor 93 | 94 | # Read the required predictors to run the tests: 95 | # model | model_options | dataset 96 | # required_predictors = pd.read_csv(REQUIRED_PREDICTORS, sep=';') 97 | 98 | # Create missing predictors. 99 | # model | model_options | dataset 100 | # for index, row in required_predictors.iterrows(): 101 | # options = ast.literal_eval(row['model_options']) 102 | # file_name = create_predictor_name(row['dataset'], row['model'], options) 103 | # if not get_predictor_path(file_name).is_file(): 104 | # dataset = get_dataset(row['dataset']) 105 | # if row['bins'] > 0: 106 | # schema = get_schema(dataset) # int(row['bins']) 107 | # dataset = get_discrete_dataset(dataset.iloc[:, :-1], schema).join(dataset.iloc[:, -1]) 108 | # model, _ = get_model(row['model'], options) 109 | # training_set, test_set = train_test_split(dataset, test_size=0.5, 110 | # random_state=get_default_random_seed()) 111 | # if isinstance(model, Model): 112 | # keys = set(training_set.iloc[:, -1]) 113 | # mapping = {key: i for i, key in enumerate(keys)} 114 | # training_set.iloc[:, -1] = training_set.iloc[:, -1].apply(lambda x: mapping[x]) 115 | # test_set.iloc[:, -1] = test_set.iloc[:, -1].apply(lambda x: mapping[x]) 116 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1], epochs=EPOCHS, batch_size=BATCH_SIZE) 117 | # else: 118 | # model.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 119 | # predictor = Predictor(model) 120 | # predictor.save_to_onnx(PATH / file_name, Predictor.get_initial_types(training_set.iloc[:, :-1])) 121 | 122 | # required_predictors.to_csv(REQUIRED_PREDICTORS, sep=';', index=False) 123 | 124 | # print("Done") 125 | 126 | 127 | class CreateTheoryPlot(distutils.cmd.Command): 128 | description = 'create a plot representing samples X and their class/regression value Y predicted by a theory' 129 | user_options = [('theory=', 't', 'textual file of a Prolog theory'), 130 | ('dataset=', 'd', 'file of a dataset'), 131 | ('azimuth=', 'a', 'azimuth of the plot'), 132 | ('distance=', 'D', 'distance from the plot'), 133 | ('elevation=', 'e', 'elevation of the plot'), 134 | ('output=', 'o', 'output file name of the plot'), 135 | ('show=', 's', 'show theory in the plot ([y]/n)'), 136 | ] 137 | default_output_file_name = 'dummy/plot' 138 | default_theory_name = 'dummy/iris-theory' 139 | default_dataset_name = 'dummy/iris' 140 | default_azimuth = '45' 141 | default_distance = '9' 142 | default_elevation = '5' 143 | csv_format = '.csv' 144 | txt_format = '.txt' 145 | pdf_format = '.pdf' 146 | 147 | def initialize_options(self): 148 | self.output = self.default_output_file_name 149 | self.theory = self.default_theory_name 150 | self.dataset = self.default_dataset_name 151 | self.azimuth = self.default_azimuth 152 | self.elevation = self.default_elevation 153 | 
self.distance = self.default_distance 154 | self.show = True 155 | 156 | def finalize_options(self): 157 | self.theory_file = str(self.theory) 158 | self.data = str(self.dataset) 159 | self.output = str(self.output) 160 | self.a = float(self.azimuth) 161 | self.e = float(self.elevation) 162 | self.d = float(self.distance) 163 | self.s = self.show in (True, 'y', 'Y', 'yes', 'YES', 'Yes') 164 | 165 | def run(self): 166 | import pandas as pd 167 | from tuprolog.theory.parsing import parse_theory 168 | from psyke.utils.plot import plot_theory 169 | 170 | if self.theory_file is None or self.theory_file == '': 171 | raise Exception('Empty theory file name') 172 | if self.data is None or self.data == '': 173 | raise Exception('Empty dataset file name') 174 | with open(self.theory_file + (self.txt_format if '.' not in self.theory_file else ''), 'r') as file: 175 | textual_theory = file.read() 176 | theory = parse_theory(textual_theory) 177 | data = pd.read_csv(self.data + (self.csv_format if '.' not in self.data else '')) 178 | plot_theory(theory, data, self.output + self.pdf_format, self.a, self.d, self.e, self.s) 179 | 180 | 181 | setup( 182 | name='psyke', # Required 183 | version=version, 184 | description='Python-based implementation of PSyKE, i.e. a Platform for Symbolic Knowledge Extraction', 185 | license='Apache 2.0 License', 186 | long_description=long_description, 187 | long_description_content_type='text/markdown', 188 | url='https://github.com/psykei/psyke-python', 189 | author='Matteo Magnini', 190 | author_email='matteo.magnini@unibo.it', 191 | classifiers=[ 192 | 'Development Status :: 3 - Alpha', 193 | 'Intended Audience :: Developers', 194 | 'Topic :: Software Development :: Libraries', 195 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 196 | 'License :: OSI Approved :: Apache Software License', 197 | 'Programming Language :: Python :: 3', 198 | 'Programming Language :: Python :: 3.9', 199 | 'Programming Language :: Python :: 3 :: Only', 200 | 'Programming Language :: Prolog' 201 | ], 202 | keywords='knowledge extraction, symbolic ai, ske, extractor, rules, prolog', # Optional 203 | # package_dir={'': 'src'}, # Optional 204 | packages=find_packages('.'), # Required 205 | include_package_data=True, 206 | python_requires='>=3.9.0, <3.10', 207 | install_requires=REQUIREMENTS, # Optional 208 | zip_safe=False, 209 | platforms="Independent", 210 | project_urls={ # Optional 211 | 'Bug Reports': 'https://github.com/psykei/psyke-python/issues', 212 | # 'Funding': 'https://donate.pypi.org', 213 | # 'Say Thanks!': 'http://saythanks.io/to/example', 214 | 'Source': 'https://github.com/psykei/psyke-python', 215 | }, 216 | cmdclass={ 217 | 'get_project_version': GetVersionCommand, 218 | # 'create_test_predictors': CreateTestPredictors, 219 | 'create_theory_plot': CreateTheoryPlot 220 | }, 221 | ) 222 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import Iterable, Union 5 | import numpy as np 6 | import onnxruntime 7 | import pandas as pd 8 | from keras import Input, Model 9 | from keras.src.layers import Dense 10 | #from tensorflow.python.saved_model.save import save 11 | from tensorflow.saved_model import save 12 | from onnxconverter_common import FloatTensorType, Int64TensorType, StringTensorType, DataType 13 | #from skl2onnx import convert_sklearn 14 | from 
sklearn.ensemble import RandomForestRegressor 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 17 | from tensorflow.random import set_seed 18 | import tensorflow as tf 19 | #from tensorflow.keras import Input, Model 20 | #from tensorflow.keras.layers import Dense 21 | from psyke.schema import DiscreteFeature, Value 22 | from psyke.utils import get_default_random_seed 23 | from sklearn.datasets import fetch_california_housing, load_iris 24 | from psyke import Extractor 25 | from psyke.utils.dataframe import get_discrete_features_supervised 26 | from test.resources.predictors import PATH, get_predictor_path 27 | 28 | REQUIRED_PREDICTORS: str = PATH / '.required.csv' 29 | LE = '=<' 30 | GE = '>=' 31 | L = '<' 32 | G = '>' 33 | 34 | 35 | def get_extractor(extractor_type: str, parameters: dict): 36 | if extractor_type.lower() == 'cart': 37 | return Extractor.cart(**parameters) 38 | elif extractor_type.lower() == 'iter': 39 | return Extractor.iter(**parameters) 40 | elif extractor_type.lower() == 'real': 41 | return Extractor.real(**parameters) 42 | elif extractor_type.lower() == 'trepan': 43 | return Extractor.trepan(**parameters) 44 | elif extractor_type.lower() == 'gridex': 45 | return Extractor.gridex(**parameters) 46 | else: 47 | raise NotImplementedError(extractor_type + ' not implemented yet.') 48 | 49 | 50 | def get_model(model_type: str, parameters: dict): 51 | if model_type.lower() == 'rfr': 52 | return RandomForestRegressor(**parameters, random_state=np.random.seed(get_default_random_seed())), False 53 | elif model_type.lower() == 'knnc': 54 | return KNeighborsClassifier(**parameters), False # It's deterministic, don't have a random_state 55 | elif model_type.lower() == 'dtc': 56 | return DecisionTreeClassifier(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 57 | elif model_type.lower() == 'dtr': 58 | return DecisionTreeRegressor(max_depth=3, random_state=np.random.seed(get_default_random_seed())), False 59 | elif model_type.lower() == 'nn': 60 | return get_simple_neural_network(**parameters, random_state=np.random.seed(get_default_random_seed())), False 61 | else: 62 | return Predictor.load_from_onnx(str(get_predictor_path(model_type))), True 63 | 64 | 65 | def get_simple_neural_network(input: int = 4, output: int = 3, layers: int = 3, neurons: int = 32, 66 | random_state: int = np.random.seed(get_default_random_seed())) -> Model: 67 | set_seed(random_state) 68 | input_layer = Input(input) 69 | x = input_layer 70 | for _ in range(layers-1): 71 | x = Dense(neurons, activation='relu')(x) 72 | x = Dense(output, activation='softmax')(x) 73 | model = Model(input_layer, x) 74 | model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) 75 | return model 76 | 77 | 78 | def get_dataset(name: str): 79 | if name.lower() == 'house': 80 | x, y = fetch_california_housing(return_X_y=True, as_frame=True) 81 | normalized_x = _normalize_data(x) 82 | normalized_y = _normalize_data(y) 83 | return normalized_x.join(normalized_y) 84 | elif name.lower() == 'iris': 85 | x, y = load_iris(return_X_y=True, as_frame=True) 86 | y = pd.DataFrame(y).replace({"target": {0: 'setosa', 1: 'versicolor', 2: 'virginica'}}) 87 | result = x.join(y) 88 | result.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'iris'] 89 | return result 90 | else: 91 | raise Exception('unknown dataset name.') 92 | 93 | 94 | def _normalize_data(x: pd.DataFrame) -> pd.DataFrame: 95 | 
return (x - x.min()) / (x.max() - x.min()) 96 | 97 | 98 | def get_schema(dataset: pd.DataFrame) -> Union[Iterable[DiscreteFeature], None]: 99 | return get_discrete_features_supervised(dataset) 100 | # return SCHEMAS[filename] if filename in SCHEMAS.keys() else None 101 | 102 | 103 | def _get_admissible_values(prepositions: Iterable[str]) -> dict[str, Value]: 104 | raise NotImplementedError('Automatic schema reading not implemented yet.') 105 | 106 | 107 | class Predictor: 108 | 109 | def __init__(self, model, from_file_onnx=False): 110 | self._model = model 111 | self._from_file_onnx = from_file_onnx 112 | 113 | @staticmethod 114 | def load_from_onnx(file: str) -> Predictor: 115 | return Predictor(onnxruntime.InferenceSession(file), True) 116 | 117 | #def save_to_onnx(self, file, initial_types: list[tuple[str, DataType]]): 118 | # file = str(file) + '.onnx' 119 | # if not self._from_file_onnx: 120 | # if os.path.exists(file): 121 | # os.remove(file) 122 | # if isinstance(self._model, Model): 123 | # save(self._model, "tmp_model") 124 | # os.system("python -m tf2onnx.convert --saved-model tmp_model --output " + file) 125 | # else: 126 | # onnx_predictor = convert_sklearn(self._model, initial_types=initial_types) 127 | # with open(file, 'wb') as f: 128 | # f.write(onnx_predictor.SerializeToString()) 129 | 130 | def predict(self, dataset: pd.DataFrame | np.ndarray) -> Iterable: 131 | array = dataset.to_numpy() if isinstance(dataset, pd.DataFrame) else dataset 132 | if self._from_file_onnx: 133 | input_name = self._model.get_inputs()[0].name 134 | label_name = self._model.get_outputs()[0].name 135 | if array.dtype == 'float64': 136 | tensor_type = np.float32 137 | elif array.dtype == 'int64' or array.dtype == 'int32': 138 | tensor_type = np.int64 139 | else: 140 | tensor_type = np.str_ # string features fall back to NumPy's unicode type 141 | pred_onx = self._model.run([label_name], {input_name: array.astype(tensor_type)})[0] 142 | return [prediction for plist in pred_onx for prediction in plist] if isinstance(pred_onx[0], list) \ 143 | else [prediction for prediction in pred_onx] 144 | else: 145 | return self._model.predict(dataset) 146 | 147 | # TODO: to be improved, make it more flexible 148 | @staticmethod 149 | def get_initial_types(dataset: pd.DataFrame | np.ndarray) -> list[tuple[str, DataType]]: 150 | array = dataset.to_numpy() if isinstance(dataset, pd.DataFrame) else dataset 151 | name = '' 152 | for column in dataset.columns: 153 | name += column + ', ' 154 | name = name[:-2] 155 | shape = [None, array.shape[1]] 156 | if array.dtype == 'float64': 157 | types = FloatTensorType(shape) 158 | elif array.dtype == 'int64': 159 | types = Int64TensorType(shape) 160 | else: 161 | types = StringTensorType(shape) 162 | return [(name, types)] 163 | -------------------------------------------------------------------------------- /test/psyke/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from sklearn.model_selection import train_test_split 3 | from tuprolog.solve.prolog import prolog_solver 4 | from psyke.extraction.hypercubic import Grid, FeatureRanker 5 | from psyke.utils.dataframe import get_discrete_dataset 6 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 7 | from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy 8 | from test import get_dataset, get_extractor, get_schema, get_model 9 | from test.resources.tests import test_cases 10 | from tuprolog.theory import Theory, mutable_theory 11 | from 
tuprolog.theory.parsing import parse_theory 12 | from typing import Callable 13 | import ast 14 | import numpy as np 15 | from psyke import get_default_random_seed 16 | 17 | 18 | def initialize(file: str) -> list[dict[str, Theory]]: 19 | for row in test_cases(file): 20 | params = dict() if row['extractor_params'] == '' else ast.literal_eval(row['extractor_params']) 21 | dataset = get_dataset(row['dataset']) 22 | 23 | training_set, test_set = train_test_split(dataset, test_size=0.05 if row['dataset'].lower() == 'house' else 0.5, 24 | random_state=get_default_random_seed()) 25 | 26 | schema, test_set_for_predictor = None, test_set 27 | if 'disc' in row.keys() and bool(row['disc']): 28 | schema = get_schema(training_set) 29 | params['discretization'] = schema 30 | training_set = get_discrete_dataset(training_set.iloc[:, :-1], schema) \ 31 | .join(training_set.iloc[:, -1].reset_index(drop=True)) 32 | test_set_for_predictor = get_discrete_dataset(test_set.iloc[:, :-1], schema) \ 33 | .join(test_set.iloc[:, -1].reset_index(drop=True)) 34 | 35 | # Handle Cart tests. 36 | # Cart needs to inspect the tree of the predictor. 37 | # Unfortunately onnx does not provide a method to do that. 38 | #if row['predictor'].lower() not in ['dtc', 'dtr']: 39 | # params['predictor'] = Predictor.load_from_onnx(str(get_predictor_path(row['predictor']))) 40 | #else: 41 | predictor, fitted = get_model(row['predictor'], {}) 42 | if not fitted: 43 | predictor.fit(training_set.iloc[:, :-1], training_set.iloc[:, -1]) 44 | params['predictor'] = predictor 45 | 46 | # Handle GridEx tests 47 | # TODO: this is algorithm specific therefore it should be handled inside the algorithm itself. 48 | if 'grid' in row.keys() and bool(row['grid']): 49 | strategy, n = eval(row['strategies']) 50 | if strategy == "F": 51 | params['grid'] = Grid(int(row['grid']), FixedStrategy(n)) 52 | else: 53 | ranked = FeatureRanker(training_set.columns[:-1]) \ 54 | .fit(params['predictor'], training_set.iloc[:, :-1]).rankings() 55 | params['grid'] = Grid(int(row['grid']), AdaptiveStrategy(ranked, n)) 56 | 57 | extractor = get_extractor(row['extractor_type'], params) 58 | theory = extractor.extract(training_set) 59 | 60 | # Compute predictions from rules 61 | index = test_set.shape[1] - 1 62 | 63 | cast, substitutions = get_substitutions(test_set, theory) 64 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions if query.is_yes] 65 | predictions = [prediction for prediction in extractor.predict(test_set_for_predictor.iloc[:, :-1]) 66 | if prediction is not None] 67 | 68 | yield { 69 | 'extractor': extractor, 70 | 'extracted_theory': theory, 71 | 'extracted_test_y_from_theory': np.array(expected), 72 | 'extracted_test_y_from_extractor': np.array(predictions), 73 | 'test_set': test_set, 74 | 'expected_theory': parse_theory(row['theory'] + '.') if row['theory'] != '' else None, 75 | 'discretization': schema 76 | } 77 | 78 | 79 | def get_substitutions(test_set, theory): 80 | cast: Callable = lambda x: (str(x) if isinstance(test_set.iloc[0, -1], str) else float(x.value)) 81 | solver = prolog_solver(static_kb=mutable_theory(theory).assertZ(get_in_rule()).assertZ(get_not_in_rule())) 82 | substitutions = [solver.solveOnce(data_to_struct(data)) for _, data in test_set.iterrows()] 83 | return cast, substitutions 84 | -------------------------------------------------------------------------------- /test/psyke/clustering/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/psykei/psyke-python/1e4ce34679d34abde6f7545a070aab99d3b053bb/test/psyke/clustering/__init__.py -------------------------------------------------------------------------------- /test/psyke/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/cart/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/cart/test_cart.py: -------------------------------------------------------------------------------- 1 | from parameterized import parameterized_class 2 | from psyke.utils import get_default_precision 3 | from psyke import logger 4 | from test.psyke import initialize 5 | import unittest 6 | 7 | """ 8 | TODO (?): right now there is a small chance that corner data are wrongly predicted (acceptable for now). 9 | In other words, if we use the extracted rules (with a specific default accuracy for floats) 10 | and compare their results with those of the actual decision tree (whose thresholds are not truncated), 11 | they may differ. This issue will be addressed when all extractors are refactored. 12 | """ 13 | 14 | 15 | @parameterized_class(initialize('cart')) 16 | class TestCart(unittest.TestCase): 17 | 18 | def test_extract(self): 19 | logger.info(self.expected_theory) 20 | logger.info(self.extracted_theory) 21 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 22 | 23 | def test_predict(self): 24 | # self.assertEqual(self.extracted_test_y_from_theory, self.extracted_test_y_from_pruned_theory) 25 | if isinstance(self.extracted_test_y_from_theory[0], str): 26 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 27 | else: 28 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 29 | get_default_precision()) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /test/psyke/extraction/cart/test_simplified_cart.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | from parameterized import parameterized_class 5 | from sklearn.model_selection import train_test_split 6 | from tuprolog.solve.prolog import prolog_solver 7 | from tuprolog.theory import mutable_theory 8 | 9 | from psyke import Extractor 10 | from psyke.utils import get_default_precision 11 | from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule 12 | from test import get_dataset, get_model 13 | import unittest 14 | 15 | 16 | # TODO: should be refactored using a .csv file 17 | from test.psyke import get_substitutions 18 | 19 | 20 | @parameterized_class([{"dataset": "iris", "predictor": "DTC", "task": "extraction"}, 21 | {"dataset": "house", "predictor": "DTR", "task": "hypercubic"}]) 22 | class TestSimplifiedCart(unittest.TestCase): 23 | 24 | def test_equality(self): 25 | dataset = get_dataset(self.dataset) 26 | dataset = dataset.reindex(sorted(dataset.columns[:-1]) + [dataset.columns[-1]], axis=1) 27 | train, test = train_test_split(dataset, test_size=0.5) 28 | tree, _ = get_model(self.predictor, {}) 29 | tree.fit(train.iloc[:, :-1], 
train.iloc[:, -1]) 30 | extractor = Extractor.cart(tree, simplify=False) 31 | theory = extractor.extract(train) 32 | simplified_extractor = Extractor.cart(tree) 33 | simplified_theory = simplified_extractor.extract(train) 34 | 35 | index = test.shape[1] - 1 36 | cast, substitutions = get_substitutions(test, theory) 37 | expected = [cast(query.solved_query.get_arg_at(index)) for query in substitutions] 38 | 39 | cast, simplified_substitutions = get_substitutions(test, simplified_theory) 40 | simplified_expected = [cast(query.solved_query.get_arg_at(index)) for query in simplified_substitutions] 41 | 42 | if isinstance(test.iloc[0, -1], str): 43 | self.assertTrue(all(np.array(extractor.predict(test.iloc[:, :-1])) == 44 | np.array(simplified_extractor.predict(test.iloc[:, :-1])))) 45 | self.assertEqual(expected, simplified_expected) 46 | else: 47 | self.assertTrue(max(abs(np.array(extractor.predict(test.iloc[:, :-1])) - 48 | np.array(simplified_extractor.predict(test.iloc[:, :-1]))) 49 | ) < get_default_precision()) 50 | self.assertTrue(max(abs(np.array(expected) - np.array(simplified_expected))) < get_default_precision()) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /test/psyke/extraction/hypercubic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/hypercubic/gridex/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/hypercubic/gridex/test_gridex.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from test.psyke import initialize 4 | import unittest 5 | 6 | 7 | @parameterized_class(initialize('gridex')) 8 | class TestGridEx(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | # This test does not pass the ci, however it is not clear to me why (local ok). Could it be non-deterministic? 14 | # self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | # TODO: check this! 
21 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 0.05) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /test/psyke/extraction/hypercubic/iter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/1e4ce34679d34abde6f7545a070aab99d3b053bb/test/psyke/extraction/hypercubic/iter/__init__.py -------------------------------------------------------------------------------- /test/psyke/extraction/hypercubic/iter/test_iter.py: -------------------------------------------------------------------------------- 1 | from psyke import logger 2 | from parameterized import parameterized_class 3 | from psyke.utils import get_default_precision 4 | from test.psyke import initialize 5 | import unittest 6 | 7 | 8 | @parameterized_class(initialize('iter')) 9 | class TestIter(unittest.TestCase): 10 | 11 | def test_extract(self): 12 | logger.info(self.expected_theory) 13 | logger.info(self.extracted_theory) 14 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 15 | 16 | def test_predict(self): 17 | if isinstance(self.extracted_test_y_from_theory[0], str): 18 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 19 | else: 20 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 21 | get_default_precision()) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /test/psyke/extraction/real/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/real/test_real.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from test.psyke import initialize 5 | 6 | 7 | @parameterized_class(initialize('real')) 8 | class TestReal(unittest.TestCase): 9 | 10 | def test_extract(self): 11 | logger.info(self.expected_theory) 12 | logger.info(self.extracted_theory) 13 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 14 | 15 | def test_predict(self): 16 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /test/psyke/extraction/real/test_rule.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from psyke.extraction.real.utils import Rule 3 | from psyke.utils.dataframe import split_features 4 | from test import get_dataset 5 | 6 | 7 | class TestRule(unittest.TestCase): 8 | 9 | def test_subrule(self): 10 | pred_1, pred_2 = ['V1', 'V2'], ['V3', 'V4'] 11 | rule_1 = Rule(pred_1, pred_2) 12 | self.assertTrue(rule_1 in rule_1) 13 | rule_2 = Rule(pred_2, pred_1) 14 | self.assertFalse(rule_1 in rule_2) 15 | self.assertFalse(rule_2 in rule_1) 16 | rule_3 = Rule(['V1'], ['V3']) 17 | self.assertTrue(rule_1 in rule_3) 18 | self.assertFalse(rule_3 in rule_1) 19 | self.assertFalse(rule_2 in rule_3) 20 | self.assertFalse(rule_3 in rule_2) 21 | 
rule_4 = Rule(["V1"], ["V5"]) 22 | self.assertFalse(rule_1 in rule_4) 23 | self.assertFalse(rule_4 in rule_1) 24 | rule_5 = Rule(["V1", "V6"], ["V3", "V4"]) 25 | self.assertFalse(rule_1 in rule_5) 26 | self.assertFalse(rule_5 in rule_1) 27 | self.assertTrue(rule_1 in Rule([], [])) 28 | 29 | def test_reduce(self): 30 | dataset = get_dataset('iris') 31 | features = split_features(dataset) 32 | rule = Rule(["V1_1", "V2_2", "V3_0"], 33 | ["V1_0", "V2_1", "V2_0", "V4_1", "V4_2"]) 34 | reduced_rule = Rule(["V1_1", "V2_2", "V3_0"], 35 | ["V4_1", "V4_2"]) 36 | self.assertEqual(reduced_rule.true_predicates, rule.reduce(features).true_predicates) 37 | self.assertEqual(reduced_rule.false_predicates, rule.reduce(features).false_predicates) 38 | self.assertEqual(reduced_rule.true_predicates, reduced_rule.reduce(features).true_predicates) 39 | self.assertEqual(reduced_rule.false_predicates, reduced_rule.reduce(features).false_predicates) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() -------------------------------------------------------------------------------- /test/psyke/extraction/trepan/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/psyke/extraction/trepan/test_node.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node 2 | from test import get_dataset 3 | import pandas as pd 4 | import unittest 5 | 6 | 7 | class TestNode(unittest.TestCase): 8 | 9 | dataset: pd.DataFrame = get_dataset('iris') 10 | n_examples = dataset.shape[0] 11 | all_node = Node(dataset, n_examples) 12 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 13 | virginica_10 = Node(dataset.iloc[95:110, :], n_examples) 14 | versicolor_50 = Node(dataset.iloc[20:130, :], n_examples) 15 | 16 | def test_reach(self): 17 | node = Node(self.dataset, self.n_examples) 18 | self.assertEqual(node.reach, self.all_node.reach) 19 | self.assertTrue(self.virginica_10.reach < self.setosa_40.reach) 20 | self.assertTrue(self.setosa_40.reach < self.versicolor_50.reach) 21 | self.assertTrue(self.versicolor_50.reach < self.all_node.reach) 22 | 23 | def test_dominant(self): 24 | self.assertEqual('setosa', self.setosa_40.dominant) 25 | self.assertEqual('virginica', self.virginica_10.dominant) 26 | self.assertEqual('versicolor', self.versicolor_50.dominant) 27 | 28 | def test_correct(self): 29 | self.assertEqual(50, self.versicolor_50.correct) 30 | self.assertEqual(40, self.setosa_40.correct) 31 | self.assertEqual(10, self.virginica_10.correct) 32 | 33 | def test_fidelity(self): 34 | self.assertEqual(50 / 150, self.all_node.fidelity) 35 | self.assertEqual(40 / 60, self.setosa_40.fidelity) 36 | self.assertEqual(10 / 15, self.virginica_10.fidelity) 37 | self.assertEqual(50 / 110, self.versicolor_50.fidelity) 38 | 39 | def test_priority(self): 40 | self.assertTrue(self.all_node.priority < self.versicolor_50.priority) 41 | self.assertTrue(self.versicolor_50.priority < self.setosa_40.priority) 42 | self.assertTrue(self.setosa_40.priority < self.virginica_10.priority) 43 | 44 | def test_n_classes(self): 45 | self.assertEqual(3, self.all_node.n_classes) 46 | self.assertEqual(2, self.virginica_10.n_classes) 47 | self.assertEqual(2, self.setosa_40.n_classes) 48 | self.assertEqual(3, self.versicolor_50.n_classes) 49 | self.assertEqual(1, Node(self.dataset.iloc[15:40, :], self.n_examples).n_classes) 50 | 51 | def test_iterator(self): 52 
| node = Node(self.dataset, self.n_examples) 53 | child_1 = Node(self.dataset.iloc[:50, :], self.n_examples) 54 | child_2 = Node(self.dataset.iloc[50:150, :], self.n_examples) 55 | node.children = [child_1, child_2] 56 | grandchild_1_1 = Node(self.dataset.iloc[:25, :], self.n_examples) 57 | grandchild_2_1 = Node(self.dataset.iloc[50:80, :], self.n_examples) 58 | grandchild_2_2 = Node(self.dataset.iloc[80:120, :], self.n_examples) 59 | child_1.children = [grandchild_1_1] 60 | child_2.children = [grandchild_2_1, grandchild_2_2] 61 | self.assertEqual(list(node), list(child_1) + list(child_2) + [node]) 62 | self.assertEqual([grandchild_1_1, child_1, grandchild_2_1, grandchild_2_2, child_2, node], list(node)) 63 | 64 | def test_to_string(self): 65 | node = Node(self.dataset, self.n_examples, (('V1', 0.0), ('V2', 1.0))) 66 | self.assertEqual(' = setosa', str(self.all_node)) 67 | self.assertEqual(' = setosa', str(self.setosa_40)) 68 | self.assertEqual(' = versicolor', str(self.versicolor_50)) 69 | self.assertEqual(' = virginica', str(self.virginica_10)) 70 | self.assertEqual('!V1, V2 = setosa', str(node)) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /test/psyke/extraction/trepan/test_split.py: -------------------------------------------------------------------------------- 1 | from psyke.extraction.trepan import Node, Split 2 | from test import get_dataset 3 | import math 4 | import pandas as pd 5 | import unittest 6 | 7 | 8 | class TestSplit(unittest.TestCase): 9 | 10 | dataset: pd.DataFrame = get_dataset('iris') 11 | n_examples = dataset.shape[0] 12 | all_node = Node(dataset, n_examples) 13 | setosa_40 = Node(dataset.iloc[10:70, :], n_examples) 14 | setosa_40_complementar = Node(pd.concat([dataset.iloc[:10, :], dataset.iloc[70:, :]]), n_examples) 15 | versicolor_25 = Node(dataset.iloc[40:75, :], n_examples) 16 | versicolor_25_complementar = Node(dataset.iloc[75:110, :], n_examples) 17 | 18 | def test_priority(self): 19 | self.assertTrue(math.isclose(-40/60-50/90-100, 20 | Split(self.all_node, (self.setosa_40, self.setosa_40_complementar)).priority)) 21 | self.assertTrue(math.isclose((25 / 35) * - 2 - 200 + 200, 22 | Split(self.all_node, (self.versicolor_25, self.versicolor_25_complementar)) 23 | .priority)) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /test/psyke/extraction/trepan/test_trepan.py: -------------------------------------------------------------------------------- 1 | from cmath import isclose 2 | from parameterized import parameterized_class 3 | from psyke import logger 4 | from psyke.utils import get_default_precision 5 | from psyke.utils.logic import pretty_theory 6 | from test.psyke import initialize 7 | import unittest 8 | 9 | 10 | @parameterized_class(initialize('trepan')) 11 | class TestTrepan(unittest.TestCase): 12 | 13 | def test_extract(self): 14 | logger.info(pretty_theory(self.expected_theory) + '\n') 15 | logger.info(pretty_theory(self.extracted_theory) + '\n') 16 | self.assertTrue(self.expected_theory.equals(self.extracted_theory, False)) 17 | 18 | def test_predict(self): 19 | if isinstance(self.extracted_test_y_from_theory[0], str): 20 | self.assertTrue(all(self.extracted_test_y_from_theory == self.extracted_test_y_from_extractor)) 21 | else: 22 | self.assertTrue(max(abs(self.extracted_test_y_from_theory - self.extracted_test_y_from_extractor)) < 23 | 
get_default_precision()) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /test/psyke/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psykei/psyke-python/1e4ce34679d34abde6f7545a070aab99d3b053bb/test/psyke/utils/__init__.py -------------------------------------------------------------------------------- /test/psyke/utils/test_prune.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import prune 5 | 6 | 7 | class TestPrune(unittest.TestCase): 8 | 9 | def test_prune_documentation(self): 10 | theory1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 11 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 12 | pruned1 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2))." 13 | 14 | theory2 = "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0)). " \ 15 | + "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 16 | pruned2 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8))." 17 | 18 | theory3 = "c(A, B, C, D, positive) :- ('=<'(A, 1.3), '>'(B, 1.8)). " \ 19 | + "c(A, B, C, D, positive) :- ('=<'(A, 1), '>'(B, 2), '=='(C, 0))." 20 | pruned3 = pruned2 21 | 22 | self.assertTrue(theory(parse_theory(pruned1)).equals(prune(mutable_theory(parse_theory(theory1))), False)) 23 | self.assertTrue(theory(parse_theory(pruned2)).equals(prune(mutable_theory(parse_theory(theory2))), False)) 24 | self.assertTrue(theory(parse_theory(pruned3)).equals(prune(mutable_theory(parse_theory(theory3))), False)) 25 | 26 | def test_prune_success(self): 27 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2)). " \ 28 | + "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 3))." 29 | textual_pruned_theory = "p(X, Y, inside) :- ('=<'(X, 1), '>'(Y, 2))." 30 | long_theory = mutable_theory(parse_theory(textual_theory)) 31 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 32 | 33 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 34 | 35 | def test_prune_not_applied(self): 36 | textual_theory = "p(PL, PW, SL, SW, versicolor) :- '=<'(SW, 3.6). " \ 37 | + "p(PL, PW, SL, SW, versicolor) :- ('=<'(PW, 0.35), '=<'(SL, 5.35), '=<'(SW, 3.9))." 38 | textual_pruned_theory = textual_theory 39 | long_theory = mutable_theory(parse_theory(textual_theory)) 40 | pruned_theory = theory(parse_theory(textual_pruned_theory)) 41 | 42 | self.assertTrue(pruned_theory.equals(prune(long_theory), False)) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/psyke/utils/test_simplify.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tuprolog.theory import mutable_theory, theory 3 | from tuprolog.theory.parsing import parse_theory 4 | from psyke.utils.logic import simplify 5 | 6 | 7 | class TestSimplify(unittest.TestCase): 8 | 9 | def test_simplify(self): 10 | # TODO: if numbers are not float equals method return false (e.g., 2 instead of 2.0). @Giovanni 2ppy 11 | textual_theory = "p(X, Y, inside) :- ('=<'(X, 1.0), '>'(Y, 2.0), '=<'(X, 0.5))." 12 | textual_simplified_theory = "p(X, Y, inside) :- ('=<'(X, 0.5), '>'(Y, 2.0))." 
13 | long_theory = mutable_theory(parse_theory(textual_theory)) 14 | simplified_theory = theory(parse_theory(textual_simplified_theory)) 15 | 16 | self.assertTrue(simplified_theory.equals(simplify(long_theory), False)) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /test/psyke/utils/test_simplify_formatter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.tree import DecisionTreeRegressor 4 | from psyke import Extractor, get_default_random_seed 5 | from psyke.extraction.hypercubic import Grid 6 | from test import get_dataset 7 | 8 | 9 | class TestSimplifyFormatter(unittest.TestCase): 10 | 11 | def test_simplify_formatter(self): 12 | data = get_dataset('house') 13 | train, test = train_test_split(data, test_size=0.5, random_state=get_default_random_seed()) 14 | predictor = DecisionTreeRegressor() 15 | predictor.fit(train.iloc[:, :-1], train.iloc[:, -1]) 16 | extractor = Extractor.gridrex(predictor, Grid()) 17 | theory = extractor.extract(train) 18 | # print(pretty_theory(theory)) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | --------------------------------------------------------------------------------