├── .circleci └── config.yml ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build.yaml │ ├── codeql.yml │ └── lint.yaml ├── .gitignore ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmark.ipynb ├── docs ├── Makefile ├── _static │ └── custom.css ├── conf.py ├── contributing.md ├── index.rst ├── install.rst └── release_notes.rst ├── models ├── categorical-test.csv ├── gb-gbm-cat-pima-regression.pmml ├── gb-gbm-cat-pima.pmml ├── gb-xgboost-iris.pmml ├── knn-clf-pima.pmml ├── knn-reg-pima.pmml ├── linear-model-glm.pmml ├── linear-model-lasso.pmml ├── linear-model-lm.pmml ├── linear-model-lmc.pmml ├── linear-model-ridge.pmml ├── linear-model-ridgec.pmml ├── nb-cat-pima.pmml ├── nn-iris.pmml ├── nn-pima-regression.pmml ├── randomForest.pmml ├── rf-cat-pima-regression.pmml ├── rf-cat-pima.pmml ├── rf-iris.pmml ├── svc-cat-pima.pmml ├── svr-cat-pima.pmml ├── tree-cat-pima-regression.pmml ├── tree-cat-pima.pmml ├── tree-cat.pmml ├── tree-digits.pmml └── tree-iris.pmml ├── pyproject.toml ├── requirements.txt ├── setup.py ├── sklearn_pmml_model ├── __init__.py ├── auto_detect │ ├── __init__.py │ └── base.py ├── base.py ├── datatypes.py ├── ensemble │ ├── README.md │ ├── __init__.py │ ├── _gradient_boosting.pyx │ ├── forest.py │ └── gb.py ├── linear_model │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── implementations.py ├── naive_bayes │ ├── README.md │ ├── __init__.py │ └── implementations.py ├── neighbors │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py ├── neural_network │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py ├── svm │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py └── tree │ ├── README.md │ ├── __init__.py │ ├── _criterion.pxd │ ├── _criterion.pyx │ ├── _splitter.pxd │ ├── _splitter.pyx │ ├── _tree.pxd │ ├── _tree.pyx │ ├── _utils.pxd │ ├── _utils.pyx │ ├── quad_tree.pxd │ ├── quad_tree.pyx │ 
└── tree.py └── tests ├── __init__.py ├── auto_detect └── test_auto_detect.py ├── ensemble ├── test_forest.py └── test_gb.py ├── linear_model └── test_linear_model.py ├── naive_bayes └── test_naive_bayes.py ├── neighbors └── test_knn.py ├── neural_network └── test_neural_network.py ├── svm └── test_svm.py ├── test_base.py ├── test_datatypes.py └── tree └── test_tree.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | version: 2.1 3 | 4 | jobs: 5 | build: 6 | docker: 7 | # specify the version you desire here 8 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 9 | - image: circleci/python:3.9.2 10 | 11 | # Specify service dependencies here if necessary 12 | # CircleCI maintains a library of pre-built images 13 | # documented at https://circleci.com/docs/2.0/circleci-images/ 14 | # - image: circleci/postgres:9.4 15 | 16 | working_directory: ~/repo 17 | 18 | steps: 19 | - run: echo 'export PYTHONPATH=$HOME/repo' >> $BASH_ENV 20 | 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | sudo pip install codecov 37 | 38 | - run: 39 | name: install openjdk 40 | command: | 41 | sudo apt update 42 | sudo apt install default-jdk 43 | 44 | - save_cache: 45 | paths: 46 | - ./venv 47 | key: v1-dependencies-{{ checksum "requirements.txt" }} 48 | 49 | - run: 50 | name: run tests 51 | command: | 52 | . 
venv/bin/activate 53 | python setup.py build_ext --inplace 54 | pytest --cov=./sklearn_pmml_model tests/ 55 | 56 | - run: 57 | name: coverage report 58 | command: codecov 59 | 60 | - store_artifacts: 61 | path: test-reports 62 | destination: test-reports 63 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E111,E121,E114 3 | D100,D105,D102 4 | per-file-ignores = 5 | __init__.py: D200,D205,D400 6 | docstring-convention = numpy 7 | max-line-length = 120 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Description 11 | 14 | 15 | #### Steps/Code to Reproduce 16 | 25 | 26 | #### Expected Results 27 | 32 | 33 | #### Actual Results 34 | 38 | 39 | #### Versions 40 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Description 11 | 18 | 19 | #### Additional context 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | release: 7 | types: 8 | - published 9 | 10 | jobs: 11 | build_wheels: 12 | name: Build wheels on ${{ matrix.os }} 13 | runs-on: ${{ matrix.os }} 14 | 
strategy: 15 | matrix: 16 | # macos-13 is an intel runner, macos-14 is apple silicon 17 | os: [windows-latest, macos-13, macos-14, ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: actions/setup-python@v5 23 | name: Install Python 24 | with: 25 | python-version: '3.12' 26 | 27 | - name: Build wheels 28 | uses: pypa/cibuildwheel@v2.17.0 29 | 30 | - uses: actions/upload-artifact@v4 31 | with: 32 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 33 | path: ./wheelhouse/*.whl 34 | 35 | build_sdist: 36 | name: Build source distribution 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v4 40 | 41 | - uses: actions/setup-python@v5 42 | name: Install Python 43 | with: 44 | python-version: '3.12' 45 | 46 | - name: Install dependencies 47 | run: python -m pip install cython numpy setuptools 48 | 49 | - name: Build sdist 50 | run: python setup.py sdist 51 | 52 | - uses: actions/upload-artifact@v4 53 | with: 54 | name: cibw-sdist 55 | path: dist/*.tar.gz 56 | 57 | upload_pypi: 58 | name: Upload to PyPI 59 | needs: [build_wheels, build_sdist] 60 | runs-on: ubuntu-latest 61 | if: github.event_name == 'release' && github.event.action == 'published' 62 | steps: 63 | - uses: actions/download-artifact@v4 64 | with: 65 | pattern: cibw-* 66 | path: dist 67 | merge-multiple: true 68 | 69 | - uses: pypa/gh-action-pypi-publish@release/v1 70 | with: 71 | user: __token__ 72 | password: ${{ secrets.pypi_password }} 73 | skip_existing: true -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ 'master' ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [ 'master' ] 9 | schedule: 10 | - cron: '52 16 * * 0' 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | 
permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | language: [ 'python' ] 25 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 26 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 27 | 28 | steps: 29 | - name: Checkout repository 30 | uses: actions/checkout@v3 31 | 32 | # Initializes the CodeQL tools for scanning. 33 | - name: Initialize CodeQL 34 | uses: github/codeql-action/init@v2 35 | with: 36 | languages: ${{ matrix.language }} 37 | # If you wish to specify custom queries, you can do so here or in a config file. 38 | # By default, queries listed here will override any specified in a config file. 39 | # Prefix the list here with "+" to use these queries and those in the config file. 40 | 41 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 42 | queries: +security-and-quality 43 | 44 | 45 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 46 | # If this step fails, then you should remove it and run the build manually (see below) 47 | - name: Autobuild 48 | uses: github/codeql-action/autobuild@v2 49 | 50 | # ℹ️ Command-line programs to run using the OS shell. 51 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 52 | 53 | # If the Autobuild fails above, remove it and uncomment the following three lines. 54 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
55 | 56 | # - run: | 57 | # echo "Run, Build Application using script" 58 | # ./location_of_script_within_repo/buildscript.sh 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v2 62 | with: 63 | category: "/language:${{matrix.language}}" 64 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | flake8: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | 11 | - uses: actions/setup-python@v2 12 | name: Install Python 13 | with: 14 | python-version: '3.8' 15 | 16 | - name: Install flake8 17 | run: pip install flake8 flake8-docstrings 18 | 19 | - name: Run flake8 20 | run: flake8 sklearn_pmml_model 21 | 22 | commitlint: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v2 26 | with: 27 | fetch-depth: 0 28 | - uses: wagoid/commitlint-github-action@v4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Intellij IDEA 108 | .idea/ 109 | *.iml 110 | 111 | models/generate_pmml.R -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Set the version of Python and other tools you might need 5 | build: 6 | os: ubuntu-20.04 7 | tools: 8 | python: "3.9" 9 | 10 | # Build documentation in the docs/ directory with Sphinx 11 | sphinx: 12 | configuration: docs/conf.py 13 | 14 | 15 | # Optionally declare the Python requirements required to build your docs 16 | python: 17 | install: 18 | - requirements: requirements.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to 
any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | d.collaris@me.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. 
No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via a GitHub issue, email, or any other method with the owners of this repository before making a change. 4 | 5 | Please note we have a [code of conduct](https://github.com/iamDecode/sklearn-pmml-model/blob/master/CODE_OF_CONDUCT.md), please follow it in all your interactions with the project. 6 | 7 | 8 | ## Scope of this package 9 | 10 | The scope of `sklearn-pmml-model` is to import functionality to all major estimator classes of the popular machine learning library [scikit-learn](https://scikit-learn.org) using [PMML](http://dmg.org/pmml/v4-4/GeneralStructure.html). 11 | 12 | The API is designed to closely resemble the `scikit-learn` API. The same directory and component structure is used, and each estimator is a sub-class of a corresponding estimator. Note that some models may not have a `scikit-learn` implementation (e.g., Bayesian networks) and hence cannot currently be represented. 13 | 14 | We intend for the library to remain as light-weight as possible, and stick with the minimum number of additions to enable PMML import functionality without affecting the outward facing API of estimators. 15 | 16 | 17 | ## Reporting bugs 18 | 19 | We use GitHub issues to track all bugs and feature requests; feel free to open an issue if you have found a bug or wish to see a feature implemented. 20 | 21 | It is recommended to check that your issue complies with the following rules before submitting: 22 | 23 | - Verify that your issue is not being currently addressed by other [issues](https://github.com/iamDecode/sklearn-pmml-model/issues) or [pull requests](https://github.com/iamDecode/sklearn-pmml-model/pulls). 
24 | - Please include code snippets or error messages when reporting issues. When doing so, please make sure to format them using code blocks. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks). 25 | - It can often be helpful to include your operating system type and version number, as well as your Python, sklearn-pmml-model, scikit-learn, numpy, and scipy versions. This information can be found by running the following code snippet: 26 | ```python 27 | import platform; print(platform.platform()) 28 | import sys; print("Python", sys.version) 29 | import numpy; print("NumPy", numpy.__version__) 30 | import scipy; print("SciPy", scipy.__version__) 31 | import sklearn; print("Scikit-Learn", sklearn.__version__) 32 | import sklearn_pmml_model; print("sklearn-pmml-model", sklearn_pmml_model.__version__) 33 | ``` 34 | 35 | 36 | ## Get a local copy 37 | 38 | These are the steps you need to take to create a copy of the `sklearn-pmml-model` repository on your computer. 39 | 40 | 1. [Create an account](https://github.com/join) on GitHub if you do not already have one. 41 | 42 | 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [`sklearn-pmml-model` repository](https://github.com/iamDecode/sklearn-pmml-model). 43 | 44 | 3. Clone your fork of the `sklearn-pmml-model` repository from your GitHub account. Use a git GUI application (e.g., Sourcetree, GitKraken) or from command line, run: 45 | 46 | ``` 47 | $ git clone git@github.com:iamDecode/sklearn-pmml-model.git 48 | $ cd sklearn-pmml-model 49 | ``` 50 | 51 | 4. Create a feature branch to hold your development changes: 52 | 53 | ``` 54 | $ git checkout -b / 55 | ``` 56 | 57 | (For example: `decode/regression-trees`) 58 | 59 | 60 | ## Setting up a development environment 61 | 62 | After you have created a copy of our main repository on GitHub, you need to set up a local development environment. 
We recommend creating a virtual environment and activating it: 63 | ``` 64 | $ python3 -m venv venv 65 | $ source venv/bin/activate 66 | ``` 67 | 68 | and install the dependencies within the virtual environment: 69 | 70 | ``` 71 | $ pip install -r requirements.txt 72 | ``` 73 | 74 | The final step is to build the Cython extensions (you need to rebuild once you make changes to the Cython code): 75 | 76 | ``` 77 | $ python setup.py build_ext --inplace 78 | ``` 79 | 80 | ## Making changes to the code 81 | 82 | For pull requests to be accepted, your changes must at least meet the following requirements: 83 | 84 | 1. All changes related to *one feature* must belong to *one branch*. Each branch must be self-contained, with a single new feature or bugfix. 85 | 2. Commit messages should be formulated according to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/). 86 | 3. If your pull request addresses an issue, please make sure to [link back](https://github.blog/changelog/2020-12-15-reference-issues-discussions-and-pull-requests-faster-with-multi-word-suggestions/) to the original issue. 87 | 4. Follow the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). With the following exceptions or additions: 88 | - The max line length is 120 characters instead of 80. 89 | - Indents with double spaces, not 4 spaces or tabs. 90 | 91 | You can check for compliance locally by running: 92 | ``` 93 | $ flake8 sklearn_pmml_model 94 | ``` 95 | 5. Each function, class, method, and attribute needs to be documented using docstrings. `sklearn-pmml-model` conforms to the [numpy docstring standard](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard). 96 | 6. Finally, ensure all the test cases still pass after you have made your changes. To test locally, you can run: 97 | ``` 98 | $ python setup.py pytest 99 | ``` 100 | 101 | In addition to these requirements, we strongly prefer you to consider the following guidelines. 
However, they are not strictly required, so as not to be overly prohibitive to new contributors. 102 | 103 | 7. Your change should include test cases for all new functionality being introduced. 104 | 8. No additional code style issues should be reported by [LGTM](https://lgtm.com). 105 | 106 | Continuous integration will automatically verify compliance with all of the discussed requirements. 107 | 108 | 109 | 110 | ## Submitting a Pull Request 111 | 112 | 1. When you are done coding in your feature branch, [add changed or new files](https://git-scm.com/book/en/v2/Git-Basics-Recording-Changes-to-the-Repository#_tracking_files): 113 | ``` 114 | $ git add path/to/modified_file 115 | ``` 116 | 2. Create a [commit](https://git-scm.com/book/en/v2/Git-Basics-Recording-Changes-to-the-Repository#_committing_changes) with a message describing what you changed. Commit messages should be formulated according to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) standard: 117 | ``` 118 | $ git commit 119 | ``` 120 | 3. Push the changes to GitHub: 121 | ``` 122 | $ git push -u origin my_feature 123 | ``` 124 | 4. [Create a pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Dennis Collaris 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sklearn-pmml-model 4 | 5 | [![PyPI version](https://badge.fury.io/py/sklearn-pmml-model.svg)](https://badge.fury.io/py/sklearn-pmml-model) 6 | [![codecov](https://codecov.io/gh/iamDecode/sklearn-pmml-model/branch/master/graph/badge.svg?token=CGbbgziGwn)](https://codecov.io/gh/iamDecode/sklearn-pmml-model) 7 | [![CircleCI](https://circleci.com/gh/iamDecode/sklearn-pmml-model.svg?style=shield)](https://circleci.com/gh/iamDecode/sklearn-pmml-model) 8 | [![ReadTheDocs](https://readthedocs.org/projects/sklearn-pmml-model/badge/?version=latest&style=flat)](https://sklearn-pmml-model.readthedocs.io/en/latest/) 9 | 10 | A library to effortlessly import models trained on different platforms and with programming languages into scikit-learn in Python. 
First export your model to [PMML](http://dmg.org/pmml/v4-3/GeneralStructure.html) (widely supported). Next, load the exported PMML file with this library, and use the class as any other scikit-learn estimator. 11 | 12 | 13 | ## Installation 14 | 15 | The easiest way is to use pip: 16 | 17 | ``` 18 | $ pip install sklearn-pmml-model 19 | ``` 20 | 21 | ## Status 22 | The library currently supports the following models: 23 | 24 | | Model | Classification | Regression | Categorical features | 25 | |--------------------------------------------------------|----------------|------------|----------------------| 26 | | [Decision Trees](sklearn_pmml_model/tree) | ✅ | ✅ | ✅1 | 27 | | [Random Forests](sklearn_pmml_model/ensemble) | ✅ | ✅ | ✅1 | 28 | | [Gradient Boosting](sklearn_pmml_model/ensemble) | ✅ | ✅ | ✅1 | 29 | | [Linear Regression](sklearn_pmml_model/linear_model) | ✅ | ✅ | ✅3 | 30 | | [Ridge](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 31 | | [Lasso](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 32 | | [ElasticNet](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 33 | | [Gaussian Naive Bayes](sklearn_pmml_model/naive_bayes) | ✅ | | ✅3 | 34 | | [Support Vector Machines](sklearn_pmml_model/svm) | ✅ | ✅ | ✅3 | 35 | | [Nearest Neighbors](sklearn_pmml_model/neighbors) | ✅ | ✅ | | 36 | | [Neural Networks](sklearn_pmml_model/neural_network) | ✅ | ✅ | | 37 | 38 | 1 Categorical feature support using slightly modified internals, based on [scikit-learn#12866](https://github.com/scikit-learn/scikit-learn/pull/12866). 39 | 40 | 2 These models differ only in training characteristics, the resulting model is of the same form. Classification is supported using `PMMLLogisticRegression` for regression models and `PMMLRidgeClassifier` for general regression models. 41 | 42 | 3 By one-hot encoding categorical features automatically. 
43 | 44 | ## Example 45 | A minimal working example (using [this PMML file](https://github.com/iamDecode/sklearn-pmml-model/blob/master/models/randomForest.pmml)) is shown below: 46 | 47 | ```python 48 | from sklearn.datasets import load_iris 49 | from sklearn.model_selection import train_test_split 50 | import pandas as pd 51 | import numpy as np 52 | from sklearn_pmml_model.ensemble import PMMLForestClassifier 53 | from sklearn_pmml_model.auto_detect import auto_detect_estimator 54 | 55 | # Prepare the data 56 | iris = load_iris() 57 | X = pd.DataFrame(iris.data) 58 | X.columns = np.array(iris.feature_names) 59 | y = pd.Series(np.array(iris.target_names)[iris.target]) 60 | y.name = "Class" 61 | Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33, random_state=123) 62 | 63 | # Specify the model type for the least overhead... 64 | #clf = PMMLForestClassifier(pmml="models/randomForest.pmml") 65 | 66 | # ...or simply let the library auto-detect the model type 67 | clf = auto_detect_estimator(pmml="models/randomForest.pmml") 68 | 69 | # Use the model as any other scikit-learn model 70 | clf.predict(Xte) 71 | clf.score(Xte, yte) 72 | ``` 73 | 74 | More examples can be found in the subsequent packages: [tree](sklearn_pmml_model/tree), [ensemble](sklearn_pmml_model/ensemble), [linear_model](sklearn_pmml_model/linear_model), [naive_bayes](sklearn_pmml_model/naive_bayes), [svm](sklearn_pmml_model/svm), [neighbors](sklearn_pmml_model/neighbors) and [neural_network](sklearn_pmml_model/neural_network). 75 | 76 | ## Benchmark 77 | 78 | Depending on the data set and model, `sklearn-pmml-model` is between 1 and 10 times faster than competing libraries, by leveraging the optimization and industry-tested robustness of `sklearn`. Source code for this benchmark can be found in the corresponding [jupyter notebook](benchmark.ipynb). 
79 | 80 | 81 | ### Running times (load + predict, in seconds) 82 | 83 | | | | Linear model | Naive Bayes | Decision tree | Random Forest | Gradient boosting | 84 | |---------------|---------------------|--------------|-------------|---------------|---------------|-------------------| 85 | | Wine | `PyPMML` | 0.013038 | 0.005674 | 0.005587 | 0.032734 | 0.034649 | 86 | | | `sklearn-pmml-model`| 0.00404 | 0.004059 | 0.000964 | 0.030008 | 0.032949 | 87 | | Breast cancer | `PyPMML` | 0.009838 | 0.01153 | 0.009367 | 0.058941 | 0.031196 | 88 | | | `sklearn-pmml-model`| 0.010749 | 0.008481 | 0.001106 | 0.044021 | 0.013411 | 89 | 90 | ### Improvement 91 | 92 | | | | Linear model | Naive Bayes | Decision tree | Random Forest | Gradient boosting | 93 | |---------------|--------------------|--------------|-------------|---------------|---------------|-------------------| 94 | | Wine | Improvement | 3.23× | 1.40× | 5.80× | 1.09× | 1.05× | 95 | | Breast cancer | Improvement | 0.91× | 1.36× | **8.47×** | 1.34× | 2.33× | 96 | 97 | *Benchmark ran on: 24 September 2024 17:19* 98 | 99 | ## Development 100 | 101 | ### Prerequisites 102 | 103 | Tests can be run using Py.test. Grab a local copy of the source: 104 | 105 | ``` 106 | $ git clone http://github.com/iamDecode/sklearn-pmml-model 107 | $ cd sklearn-pmml-model 108 | ``` 109 | 110 | create a virtual environment and activate it: 111 | ``` 112 | $ python3 -m venv venv 113 | $ source venv/bin/activate 114 | ``` 115 | 116 | and install the dependencies: 117 | 118 | ``` 119 | $ pip install -r requirements.txt 120 | ``` 121 | 122 | The final step is to build the Cython extensions: 123 | 124 | ``` 125 | $ python setup.py build_ext --inplace 126 | ``` 127 | 128 | ### Testing 129 | 130 | You can execute tests with py.test by running: 131 | ``` 132 | $ python setup.py pytest 133 | ``` 134 | 135 | ## Contributing 136 | 137 | Feel free to make a contribution. Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. 
138 | 139 | ## License 140 | 141 | This project is licensed under the BSD 2-Clause License - see the [LICENSE](LICENSE) file for details. 142 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .navbar .container-xl { 2 | padding-left: 15px !important; 3 | padding-right: 15px !important; 4 | } 5 | 6 | #navbar-icon-links { 7 | display: none; 8 | } 9 | @media (min-width: 960px) { 10 | #navbar-icon-links { 11 | display: flex; 12 | } 13 | } 14 | 15 | .fas.pypi { 16 | font-size: 20px !important; 17 | margin-top: -2px; 18 | } 19 | 20 | .tile { 21 | position: relative; 22 | display: block; 23 | padding: 1rem 1.2rem; 24 | border-radius: 6px; 25 | box-shadow: 0 1px 5px rgba(0, 0, 0, 0.2) 26 | } 27 | 28 | .tile .tile-title { 29 | position: relative; 30 | font-size: 22px; 31 | line-height: 1.4; 32 | margin-top: 0; 33 | margin-bottom: .5rem; 34 | } 35 | 36 | .tile .tile-title .tile-icon { 37 | position: absolute; 38 | right: 0; 39 | line-height: 1.4; 40 | transition: 
all .2s; 41 | } 42 | 43 | .tile h3.tile-title { 44 | color: rgba(var(--pst-color-link),1); 45 | } 46 | 47 | .tile h3.tile-title:hover { 48 | color:rgba(var(--pst-color-link-hover),1); 49 | } 50 | 51 | .tile h3.tile-title:before { 52 | height: 0; 53 | margin: 0; 54 | } 55 | 56 | .tile .tile-desc { 57 | margin-top: 1rem; 58 | color: #646976 59 | } 60 | 61 | .tile .tile-desc p:last-child { 62 | margin-bottom: 0 63 | } 64 | 65 | .tile:after { 66 | content: ''; 67 | position: absolute; 68 | right: 0; 69 | bottom: 0; 70 | left: 0; 71 | height: 3px; 72 | background: #4ce8ff; 73 | background: linear-gradient(90deg, #4ce8ff, #d07cff); 74 | opacity: 0; 75 | transition: all .2s 76 | } 77 | 78 | .tile:hover:after { 79 | opacity: 1 80 | } 81 | 82 | .tile:hover .tile-icon { 83 | transform: scale(1.4) 84 | } 85 | 86 | a.tile-link:active, 87 | a.tile-link:link, 88 | a.tile-link:hover, 89 | a.tile-link:focus { 90 | text-decoration: none; 91 | } 92 | 93 | .tile .tile-desc pre { 94 | font-size: 80%; 95 | white-space: pre-line; 96 | } 97 | 98 | main > div > .reference.internal.image-reference img { 99 | margin-bottom: 25px; 100 | } 101 | 102 | .sig-prename { 103 | display: none; 104 | } 105 | 106 | .section > dl.py { 107 | margin-top: 70px; 108 | } -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file only contains a selection of the most common options. For a full 6 | # list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. 
If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'sklearn-pmml-model' 23 | current_year = datetime.utcnow().year 24 | copyright = f'2018 - {current_year}, Dennis Collaris' 25 | author = 'Dennis Collaris' 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'autoapi.extension', 34 | 'numpydoc', 35 | 'sphinx_github_changelog', 36 | 'myst_parser', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 
51 | # 52 | html_theme = 'pydata_sphinx_theme' 53 | html_title = project 54 | 55 | html_theme_options = { 56 | "collapse_navigation": False, 57 | "icon_links": [ 58 | { 59 | "name": "GitHub", 60 | "url": "https://github.com/iamDecode/sklearn-pmml-model", 61 | "icon": "fab fa-github-square", 62 | }, 63 | { 64 | "name": "PyPI", 65 | "url": "https://pypi.org/project/sklearn-pmml-model", 66 | "icon": "fas fa-box pypi", 67 | }, 68 | ] 69 | } 70 | 71 | # Add any paths that contain custom static files (such as style sheets) here, 72 | # relative to this directory. They are copied after the builtin static files, 73 | # so a file named "default.css" will overwrite the builtin "default.css". 74 | html_static_path = ['_static'] 75 | 76 | master_doc = 'index' 77 | 78 | # -- AutoAPI ----------------------------------------------------------------- 79 | autoapi_type = 'python' 80 | autoapi_dirs = ['../sklearn_pmml_model'] 81 | 82 | 83 | def setup(app): 84 | app.add_css_file('custom.css') 85 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://user-images.githubusercontent.com/1223300/41346080-c2c910a0-6f05-11e8-89e9-71a72bb9543f.png 2 | :width: 300px 3 | :alt: sklearn-pmml-model 4 | 5 | Documentation 6 | ============= 7 | 8 | A library to effortlessly import models trained on different platforms and with programming languages into scikit-learn in Python. First export your model to PMML (widely supported). Next, load the exported PMML file with this library, and use the class as any other scikit-learn estimator. 9 | 10 | .. raw:: html 11 | 12 |
13 |
14 |
15 | 16 |

Install 17 | 18 |

19 |
20 |
21 |

The easiest way to install sklearn-pmml-model is to use pip by running:

22 |
23 |
24 |
$ pip install sklearn-pmml-model
25 |
26 |
27 | 28 |

Alternatively, you can install from source using the details described on GitHub.

29 |
30 |
31 |
32 | 44 |
45 | 46 | .. toctree:: 47 | :maxdepth: 4 48 | :caption: Contents: 49 | 50 | Install 51 | Contribute 52 | Release notes 53 | 54 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installing sklearn-pmml-model 2 | ============================= 3 | 4 | The easiest way to install :code:`sklearn-pmml-model` is with :ref:`install-pip`. Alternatively, you can install it :ref:`from source`. 5 | 6 | .. _install-pip: 7 | 8 | pip 9 | -------- 10 | 11 | Pre-built binary packages (wheels) are provided for Linux, MacOS, and Windows through PyPI. 12 | To install using :code:`pip`, simply run:: 13 | 14 | $ pip install sklearn-pmml-model 15 | 16 | More details on using `pip` can be found `here `_. 17 | 18 | .. _install-from-source: 19 | 20 | From source 21 | ----------- 22 | 23 | If you want to build :code:`sklearn-pmml-model` from source, you 24 | will need a C/C++ compiler to compile extensions. 25 | 26 | **Linux** 27 | 28 | On Linux, you need to install :code:`gcc`, which in most cases is available 29 | via your distribution's packaging system. 30 | Please follow your distribution's instructions on how to install packages. 31 | 32 | **MacOS** 33 | 34 | On MacOS, you need to install :code:`clang`, which is available from 35 | the *Command Line Tools* package. Open a terminal and execute:: 36 | 37 | $ xcode-select --install 38 | 39 | Alternatively, you can download it from the 40 | `Apple Developers page `_. 41 | Log in with your Apple ID, then search and download the 42 | *Command Line Tools for Xcode* package. 43 | 44 | **Windows** 45 | 46 | On Windows, the compiler you need depends on the Python version 47 | you are using. See `this guide `_ 48 | to determine which Microsoft Visual C++ compiler to use with a specific Python version. 
49 | 50 | **Installing** 51 | 52 | Grab a local copy of the source:: 53 | 54 | $ git clone http://github.com/iamDecode/sklearn-pmml-model 55 | $ cd sklearn-pmml-model 56 | 57 | create a virtual environment and activate it:: 58 | 59 | $ python3 -m venv venv 60 | $ source venv/bin/activate 61 | 62 | and install the dependencies:: 63 | 64 | $ pip install -r requirements.txt 65 | 66 | The final step is to build the Cython extensions (this part requires the C/C++ compiler):: 67 | 68 | $ python setup.py build_ext --inplace 69 | 70 | 71 | .. _dependencies: 72 | 73 | Dependencies 74 | ------------ 75 | 76 | The current minimum dependencies to run :code:`sklearn-pmml-model` are: 77 | 78 | - numpy 1.16 or later 79 | - pandas 80 | - scikit-learn 81 | - scipy 82 | - cached-property -------------------------------------------------------------------------------- /docs/release_notes.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= 3 | 4 | .. 
changelog:: 5 | :changelog-url: https://sklearn-pmml-model.readthedocs.io/en/latest/release_notes.html 6 | :github: https://github.com/iamDecode/sklearn-pmml-model/releases 7 | :pypi: https://pypi.org/project/sklearn-pmml-model -------------------------------------------------------------------------------- /models/categorical-test.csv: -------------------------------------------------------------------------------- 1 | "type","npreg","glu","bp","skin","bmi","ped","age" 2 | "Yes",2,128,78,37,43.3,1.224,"(30,40]" 3 | "Yes",12,92,62,7,27.6,0.926,"(40,50]" 4 | "Yes",11,143,94,33,36.6,0.254,"(50,60]" 5 | "Yes",9,164,84,21,30.8,0.831,"(30,40]" 6 | "Yes",8,176,90,34,33.7,0.467,"(50,60]" 7 | "Yes",8,154,78,32,32.4,0.443,"(40,50]" 8 | "Yes",5,139,80,35,31.6,0.361,"(20,30]" 9 | "Yes",5,158,84,41,39.4,0.395,"(20,30]" 10 | "Yes",1,115,70,30,34.6,0.529,"(30,40]" 11 | "Yes",7,150,78,29,35.2,0.692,"(50,60]" 12 | "Yes",10,148,84,48,37.6,1.001,"(50,60]" 13 | "Yes",3,129,92,49,36.4,0.968,"(30,40]" 14 | "Yes",0,198,66,32,41.3,0.502,"(20,30]" 15 | "Yes",0,188,82,14,32,0.682,"(20,30]" 16 | "Yes",3,158,76,36,31.6,0.851,"(20,30]" 17 | "Yes",0,151,90,46,42.1,0.371,"(20,30]" 18 | "Yes",0,95,85,25,37.4,0.247,"(20,30]" 19 | "Yes",14,175,62,30,33.6,0.212,"(30,40]" 20 | "Yes",7,129,68,49,38.5,0.439,"(40,50]" 21 | "Yes",8,155,62,26,34,0.543,"(40,50]" 22 | "Yes",1,180,78.5224821507334,35.5864693480366,43.3,0.282,"(40,50]" 23 | "Yes",8,125,96,33.8348977342098,34.7868326916975,0.232,"(50,60]" 24 | "Yes",2,118,80,30.6522443724509,42.9,0.693,"(20,30]" 25 | "Yes",8,133,72,32.6743195210369,32.9,0.27,"(30,40]" 26 | "Yes",0,141,76.1270147394716,35.5136628118178,42.4,0.205,"(20,30]" 27 | "Yes",3,141,72.1571733202164,30.8080996185329,30,0.761,"(20,30]" 28 | "No",0,165,76,43,47.9,0.259,"(20,30]" 29 | "No",4,99,76,15,23.2,0.223,"(20,30]" 30 | "No",2,99,70,16,20.4,0.235,"(20,30]" 31 | "No",2,110,74,29,32.4,0.698,"(20,30]" 32 | "No",3,148,66,25,32.5,0.256,"(20,30]" 33 | "No",1,71,48,18,20.4,0.323,"(20,30]" 
34 | "No",0,119,66,27,38.8,0.259,"(20,30]" 35 | "No",1,97,68,21,27.2,1.095,"(20,30]" 36 | "No",0,86,68,32,35.8,0.238,"(20,30]" 37 | "No",2,125,60,20,33.8,0.088,"(30,40]" 38 | "No",1,114,66,36,38.1,0.289,"(20,30]" 39 | "No",6,92,62,32,32,0.085,"(40,50]" 40 | "No",0,135,94,46,40.6,0.284,"(20,30]" 41 | "No",2,121,70,32,39.1,0.886,"(20,30]" 42 | "No",12,121,78,17,26.5,0.259,"(60,70]" 43 | "No",4,110,76,20,28.4,0.118,"(20,30]" 44 | "No",3,61,82,28,34.4,0.243,"(40,50]" 45 | "No",6,111,64,39,34.2,0.26,"(20,30]" 46 | "No",1,81,74,41,46.3,1.096,"(30,40]" 47 | "No",0,137,84,27,27.3,0.231,"(50,60]" 48 | "No",1,97,70,40,38.1,0.218,"(20,30]" 49 | "No",2,105,58,40,34.9,0.225,"(20,30]" 50 | "No",0,100,88,60,46.8,0.962,"(30,40]" 51 | "No",1,90,62,12,27.2,0.58,"(20,30]" 52 | "No",3,124,80,33,33.2,0.305,"(20,30]" 53 | "No",5,139,64,35,28.6,0.411,"(20,30]" 54 | -------------------------------------------------------------------------------- /models/linear-model-glm.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-04 16:20:00 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 |
89 | -------------------------------------------------------------------------------- /models/linear-model-lasso.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-05 15:11:34 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
84 | -------------------------------------------------------------------------------- /models/linear-model-lm.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-02 14:58:43 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
52 | -------------------------------------------------------------------------------- /models/linear-model-lmc.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-05-26T11:59:49Z 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |
56 | -------------------------------------------------------------------------------- /models/linear-model-ridge.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-05 14:48:04 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 |
90 | -------------------------------------------------------------------------------- /models/linear-model-ridgec.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2021-05-21 21:56:51 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 |
93 | -------------------------------------------------------------------------------- /models/nb-cat-pima.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2020-01-28 17:45:24 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 |
139 | -------------------------------------------------------------------------------- /models/nn-iris.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2022-01-28 11:41:54 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 |
129 | -------------------------------------------------------------------------------- /models/tree-cat-pima-regression.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-06-04T14:28:07Z 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
57 | -------------------------------------------------------------------------------- /models/tree-cat-pima.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-02-01 17:43:13 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | "(20,30]" "(60,70]" 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | "(20,30]" "(60,70]" 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 |
94 | -------------------------------------------------------------------------------- /models/tree-cat.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-02-01 14:07:13 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | "category A" 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | "category B" "category C" 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | "category C" 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
69 | -------------------------------------------------------------------------------- /models/tree-iris.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-07-06T10:18:03Z 6 |
7 | 8 | PMMLPipeline(steps=[('classifier', DecisionTreeClassifier(random_state=1))]) 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 |
112 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel", "Cython", "numpy>=1.16.0"] 3 | build-backend = "setuptools.build_meta:__legacy__" 4 | 5 | [tool.cibuildwheel] 6 | before-build = "python -m pip install cython numpy" 7 | skip = "pp*" 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | scipy==1.15.1 3 | scikit-learn==1.5.2 4 | cached-property==2.0.1 5 | pytest==8.3.4 6 | pandas==2.2.3 7 | pytest-cov==6.0.0 8 | Cython==3.0.10 9 | sphinx==8.1.3 10 | numpydoc==1.8.0 11 | sphinx-autoapi==3.4.0 12 | pydata-sphinx-theme==0.16.1 13 | sphinx-github-changelog==1.4.0 14 | myst-parser==4.0.0 15 | sklearn2pmml==0.113.0 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from sklearn_pmml_model import __version__ as version 3 | import platform 4 | 5 | # Choose build type. 
# Choose build type: "optimized" or "debug".
build_type = "optimized"

# Long description for package homepage on PyPI.
# NOTE: encoding is pinned to UTF-8 because README.md contains non-ASCII
# characters (checkmarks/multiplication signs in its tables); relying on the
# platform default locale encoding crashes the build on e.g. Windows (cp1252).
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

#########################################################
# Init
#########################################################

# check for Python 2.7 or later
# http://stackoverflow.com/questions/19534896/enforcing-python-version-in-setup-py
import sys
if sys.version_info < (2, 7):
    sys.exit('Sorry, Python < 2.7 is not supported')

import os

from setuptools import setup, find_packages
from setuptools.extension import Extension

# Cython is a hard build-time requirement: the extension modules are shipped
# as .pyx sources and must be cythonized (see cythonize() call below).
try:
    from Cython.Build import cythonize
except ImportError:
    sys.exit("Cython not found. Cython is needed to build the extension modules.")


#########################################################
# Definitions
#########################################################

# Define our base set of compiler and linker flags.
#
# This is geared toward x86_64, see
# https://gcc.gnu.org/onlinedocs/gcc-4.6.4/gcc/i386-and-x86_002d64-Options.html
#
# Customize these as needed.
#
# Note that -O3 may sometimes cause mysterious problems, so we limit ourselves to -O2.
# Modules involving numerical computations.
#
# The x86-specific SIMD flags (-msse, -msse2, -mfma, -mfpmath=sse) are not
# accepted by the Apple Silicon (arm64) toolchain, so fall back to plain -O2
# there. Detection covers both a native arm64 host and cross-compilation of
# arm64/universal2 wheels via cibuildwheel (CIBW_ARCHS env var).
#
# BUGFIX: the previous condition read
#   ... == 'aarch64' or 'universal2' or 'arm64' in os.environ.get(...)
# where the bare string 'universal2' is always truthy, so the Apple Silicon
# branch was taken unconditionally and the tuned x86 flags were never used.
# Also, platform.machine() reports 'arm64' on macOS (not 'aarch64'); both
# spellings are accepted here for safety.
_cibw_archs = os.environ.get('CIBW_ARCHS', '')
if (platform.system() == 'Darwin' and platform.machine() in ('arm64', 'aarch64')) \
        or 'universal2' in _cibw_archs or 'arm64' in _cibw_archs:  # Apple Silicon
    extra_compile_args_math_optimized = ['-O2']
    extra_compile_args_math_debug = ['-O0', '-g']
else:
    extra_compile_args_math_optimized = ['-mtune=native', '-march=native', '-O2', '-msse', '-msse2', '-mfma', '-mfpmath=sse']
    extra_compile_args_math_debug = ['-mtune=native', '-march=native', '-O0', '-g']

extra_link_args_math_optimized = []
extra_link_args_math_debug = []

# Modules that do not involve numerical computations.
extra_compile_args_nonmath_optimized = ['-O2']
extra_compile_args_nonmath_debug = ['-O0', '-g']
extra_link_args_nonmath_optimized = []
extra_link_args_nonmath_debug = []

# Additional flags to compile/link with OpenMP.
openmp_compile_args = ['-fopenmp']
openmp_link_args = ['-fopenmp']


#########################################################
# Helpers
#########################################################

# Make absolute cimports work.
#
# See
# https://github.com/cython/cython/wiki/PackageHierarchy
#
# For example: my_include_dirs = [np.get_include()]
import numpy as np
my_include_dirs = [".", np.get_include()]
83 | if build_type == 'optimized': 84 | my_extra_compile_args_math = extra_compile_args_math_optimized 85 | my_extra_compile_args_nonmath = extra_compile_args_nonmath_optimized 86 | my_extra_link_args_math = extra_link_args_math_optimized 87 | my_extra_link_args_nonmath = extra_link_args_nonmath_optimized 88 | my_debug = False 89 | print( "build configuration selected: optimized" ) 90 | elif build_type == 'debug': 91 | my_extra_compile_args_math = extra_compile_args_math_debug 92 | my_extra_compile_args_nonmath = extra_compile_args_nonmath_debug 93 | my_extra_link_args_math = extra_link_args_math_debug 94 | my_extra_link_args_nonmath = extra_link_args_nonmath_debug 95 | my_debug = True 96 | print( "build configuration selected: debug" ) 97 | else: 98 | raise ValueError("Unknown build configuration '%s'; valid: 'optimized', 'debug'" % (build_type)) 99 | 100 | 101 | def declare_cython_extension(extName, use_math=False, use_openmp=False, include_dirs=None): 102 | """Declare a Cython extension module for setuptools. 103 | Parameters: 104 | extName : str 105 | Absolute module name, e.g. use `mylibrary.mypackage.mymodule` 106 | for the Cython source file `mylibrary/mypackage/mymodule.pyx`. 107 | use_math : bool 108 | If True, set math flags and link with ``libm``. 109 | use_openmp : bool 110 | If True, compile and link with OpenMP. 111 | Return value: 112 | Extension object 113 | that can be passed to ``setuptools.setup``. 
114 | """ 115 | extPath = extName.replace(".", os.path.sep)+".pyx" 116 | 117 | if use_math and os.name != 'nt': # Windows crashes when using m library 118 | compile_args = list(my_extra_compile_args_math) # copy 119 | link_args = list(my_extra_link_args_math) 120 | libraries = ["m"] # link libm; this is a list of library names without the "lib" prefix 121 | else: 122 | compile_args = list(my_extra_compile_args_nonmath) 123 | link_args = list(my_extra_link_args_nonmath) 124 | libraries = None # value if no libraries, see setuptools.extension._Extension 125 | 126 | # OpenMP 127 | if use_openmp: 128 | compile_args.insert( 0, openmp_compile_args ) 129 | link_args.insert( 0, openmp_link_args ) 130 | 131 | # See 132 | # http://docs.cython.org/src/tutorial/external.html 133 | # 134 | # on linking libraries to your Cython extensions. 135 | return Extension( 136 | extName, 137 | [extPath], 138 | extra_compile_args=compile_args, 139 | extra_link_args=link_args, 140 | include_dirs=include_dirs, 141 | libraries=libraries 142 | ) 143 | 144 | 145 | ######################################################### 146 | # Set up modules 147 | ######################################################### 148 | 149 | ext_module_tree = declare_cython_extension("sklearn_pmml_model.tree._tree", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 150 | ext_module_quad_tree = declare_cython_extension("sklearn_pmml_model.tree.quad_tree", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 151 | ext_module_criterion = declare_cython_extension("sklearn_pmml_model.tree._criterion", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 152 | ext_module_splitter = declare_cython_extension("sklearn_pmml_model.tree._splitter", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 153 | ext_module_utils = declare_cython_extension("sklearn_pmml_model.tree._utils", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 154 | ext_module_gb = 
declare_cython_extension("sklearn_pmml_model.ensemble._gradient_boosting", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 155 | 156 | cython_ext_modules = [ext_module_tree, ext_module_quad_tree, ext_module_criterion, ext_module_splitter, ext_module_utils, ext_module_gb] 157 | 158 | # Call cythonize() explicitly, as recommended in the Cython documentation. See 159 | # http://cython.readthedocs.io/en/latest/src/reference/compilation.html#compiling-with-distutils 160 | # 161 | # This will favor Cython's own handling of '.pyx' sources over that provided by setuptools. 162 | # 163 | # Note that my_ext_modules is just a list of Extension objects. We could add any C sources (not coming from Cython modules) here if needed. 164 | # cythonize() just performs the Cython-level processing, and returns a list of Extension objects. 165 | my_ext_modules = cythonize(cython_ext_modules, include_path=my_include_dirs, gdb_debug=my_debug, compiler_directives={'legacy_implicit_noexcept': True}) 166 | 167 | 168 | ######################################################### 169 | # Call setup() 170 | ######################################################### 171 | 172 | setup( 173 | name="sklearn-pmml-model", 174 | version=version, 175 | author="Dennis Collaris", 176 | author_email="d.collaris@me.com", 177 | description = "A library to parse PMML models into Scikit-learn estimators.", 178 | long_description = long_description, 179 | long_description_content_type="text/markdown", 180 | url="https://github.com/iamDecode/sklearn-pmml-model", 181 | license = "BSD-2-Clause", 182 | classifiers = [ 183 | "Programming Language :: Python :: 3", 184 | "License :: OSI Approved :: BSD License", 185 | "Operating System :: OS Independent", 186 | "Intended Audience :: Developers", 187 | "Intended Audience :: Science/Research", 188 | "Topic :: Software Development", 189 | "Topic :: Scientific/Engineering" 190 | ], 191 | 192 | setup_requires = ["cython", "numpy>=1.16.0", "pytest-runner"], 193 
| install_requires = [ 194 | 'numpy>=1.16.0', 195 | 'pandas', 196 | 'scipy', 197 | 'scikit-learn', 198 | 'cached-property' 199 | ], 200 | tests_require = [ 201 | 'pytest', 202 | ], 203 | ext_modules = my_ext_modules, 204 | packages=find_packages(), 205 | 206 | # Install also Cython headers so that other Cython modules can cimport ours 207 | # 208 | # Fileglobs relative to each package, **does not** automatically recurse into subpackages. 209 | # FIXME: force sdist, but sdist only, to keep the .pyx files (this puts them also in the bdist) 210 | package_data={'sklearn_pmml_model.tree': ['*.pxd', '*.pyx'], 'sklearn_pmml_model.ensemble': ['*.pxd', '*.pyx']}, 211 | 212 | # Disable zip_safe, because: 213 | # - Cython won't find .pxd files inside installed .egg, hard to compile libs depending on this one 214 | # - dynamic loader may need to have the library unzipped to a temporary directory anyway (at import time) 215 | zip_safe = False 216 | ) 217 | -------------------------------------------------------------------------------- /sklearn_pmml_model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | sklearn-pmml-model. 3 | 4 | A Python library that provides import functionality to all major estimator 5 | classes of the popular machine learning library scikit-learn using PMML. 6 | This enables portability and interoperability with a wide range of different 7 | languages, toolkits and enterprise software. 8 | """ 9 | 10 | # License: BSD 2-Clause 11 | 12 | __version__ = '1.0.7' 13 | -------------------------------------------------------------------------------- /sklearn_pmml_model/auto_detect/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn.auto_detect` module implements methods to automatically 3 | detect the type of model from a PMML file. 
"""
The :mod:`sklearn_pmml_model.auto_detect` module implements methods to
automatically detect the type of model from a PMML file.
"""

# License: BSD 2-Clause

from .base import auto_detect_estimator, auto_detect_classifier, auto_detect_regressor

__all__ = [
    'auto_detect_estimator',
    'auto_detect_classifier',
    'auto_detect_regressor',
]
def auto_detect_estimator(pmml, **kwargs):
  """
  Automatically detect and return the described estimator from PMML file.

  Parameters
  ----------
  pmml : str, object
      Filename or file object containing PMML data.

  """
  # The PMML data is consumed twice: once here to inspect the target field,
  # and once by the detected estimator. Non-seekable streams therefore need
  # to be buffered into a rewindable in-memory stream first.
  if isinstance(pmml, io.IOBase) and not pmml.seekable():
    buffered = pmml.read()
    if isinstance(buffered, bytes):
      pmml = io.BytesIO(buffered)
    if isinstance(buffered, str):
      pmml = io.StringIO(buffered)

  base = PMMLBaseEstimator(pmml=pmml)
  target_name = base.target_field.attrib['name']
  target_type = base.field_mapping[target_name][1]

  # Categorical (or plain string) targets indicate classification;
  # anything else is treated as regression.
  if isinstance(target_type, Category) or target_type is str:
    return auto_detect_classifier(pmml, **kwargs)
  return auto_detect_regressor(pmml, **kwargs)
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML MiningModel into a fitted RandomForestClassifier."""
    PMMLBaseClassifier.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
      raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
      raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['majorityVote', 'average']:
      raise Exception('PMML model ensemble should use majority vote or average.')

    # Parse segments: only segments guarded by a <True/> predicate are
    # supported; all others are dropped with a warning.
    segments = segmentation.findall('Segment')
    valid_segments = [segment for segment in segments if segment.find('True') is not None]

    if len(valid_segments) < len(segments):
      warnings.warn(
        'Warning: {} segment(s) ignored because of unsupported predicate.'
        .format(len(segments) - len(valid_segments))
      )

    n_estimators = len(valid_segments)
    RandomForestClassifier.__init__(self, n_estimators=n_estimators, n_jobs=n_jobs)
    self._validate_estimator()

    # Build a template tree carrying the metadata sklearn expects on each
    # member estimator (classes, feature/output/class counts).
    clf = self._make_estimator(append=False, random_state=123)
    clf.classes_ = self.classes_
    try:
      # scikit-learn >= 0.24 renamed n_features_ to n_features_in_.
      clf.n_features_in_ = self.n_features_in_
    except AttributeError:
      clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    clf.n_classes_ = self.n_classes_
    self.template_estimator = clf

    self.estimators_ = [get_tree(self, s) for s in valid_segments]

    # Required after constructing trees, because categories may be inferred in
    # the parsing process
    target = self.target_field.get('name')
    fields = [field for name, field in self.fields.items() if name != target]
    for clf in self.estimators_:
      # -1 marks a non-categorical feature; otherwise the category count.
      n_categories = np.asarray([
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
      ], dtype=np.int32, order='C')
      clf.n_categories = n_categories
      clf.tree_.set_n_categories(n_categories)

    self.categorical = [x != -1 for x in self.estimators_[0].n_categories]
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML MiningModel into a fitted RandomForestRegressor."""
    PMMLBaseRegressor.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
      raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
      raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['majorityVote', 'average']:
      raise Exception('PMML model ensemble should use majority vote or average.')

    # Parse segments: only segments guarded by a <True/> predicate are
    # supported; all others are dropped with a warning.
    segments = segmentation.findall('Segment')
    valid_segments = [segment for segment in segments if segment.find('True') is not None]

    if len(valid_segments) < len(segments):
      warnings.warn(
        'Warning: {} segment(s) ignored because of unsupported predicate.'.format(
          len(segments) - len(valid_segments)
        )
      )

    n_estimators = len(valid_segments)
    self.n_outputs_ = 1
    RandomForestRegressor.__init__(self, n_estimators=n_estimators, n_jobs=n_jobs)
    self._validate_estimator()

    # Build a template tree carrying the metadata sklearn expects on each
    # member estimator.
    clf = self._make_estimator(append=False, random_state=123)
    try:
      # scikit-learn >= 0.24 renamed n_features_ to n_features_in_.
      clf.n_features_in_ = self.n_features_in_
    except AttributeError:
      clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    self.template_estimator = clf

    # NOTE(review): rescale_factor=0.1 is hard-coded here — presumably to
    # undo a scaling applied by the PMML exporter; confirm against get_tree.
    self.estimators_ = [get_tree(self, s, rescale_factor=0.1) for s in valid_segments]

    # Required after constructing trees, because categories may be inferred in
    # the parsing process
    target = self.target_field.get('name')
    fields = [field for name, field in self.fields.items() if name != target]
    for clf in self.estimators_:
      # -1 marks a non-categorical feature; otherwise the category count.
      n_categories = np.asarray([
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
      ], dtype=np.int32, order='C')
      clf.n_categories = n_categories
      clf.tree_.set_n_categories(n_categories)

    self.categorical = [x != -1 for x in self.estimators_[0].n_categories]
class PMMLGeneralizedLinearRegressor(OneHotEncodingMixin, PMMLBaseRegressor):
  """
  Abstract class for Generalized Linear Models (GLMs).

  The coefficients are read from the <ParamMatrix> inside the PMML
  <GeneralRegressionModel> element; the <PPMatrix> links each parameter
  to a predictor field in the data.

  Parameters
  ----------
  pmml : str, object
      Filename or file object containing PMML data.

  Notes
  -----
  Specification: http://dmg.org/pmml/v4-3/GeneralRegression.html

  """

  def __init__(self, pmml):
    PMMLBaseRegressor.__init__(self, pmml)
    OneHotEncodingMixin.__init__(self)

    # All GLM information lives in a single GeneralRegressionModel element.
    glm = self.root.find('GeneralRegressionModel')
    if glm is None:
      raise Exception('PMML model does not contain GeneralRegressionModel.')

    self.coef_ = np.array(_get_coefficients(self, glm))
    self.intercept_ = _get_intercept(glm)
46 | 47 | The PMML model consists out of a element, 48 | containing a element that contains zero or more 49 | elements describing the coefficients for each parameter. Parameters 50 | are described in the element, that maps parameters to fields in 51 | the data. 52 | 53 | Parameters 54 | ---------- 55 | pmml : str, object 56 | Filename or file object containing PMML data. 57 | 58 | Notes 59 | ----- 60 | Specification: http://dmg.org/pmml/v4-3/GeneralRegression.html 61 | 62 | """ 63 | 64 | def __init__(self, pmml): 65 | PMMLBaseClassifier.__init__(self, pmml) 66 | OneHotEncodingMixin.__init__(self) 67 | 68 | # Import coefficients and intercepts 69 | model = self.root.find('GeneralRegressionModel') 70 | 71 | if model is None: 72 | raise Exception('PMML model does not contain GeneralRegressionModel.') 73 | 74 | self.coef_ = np.array([_get_coefficients(self, model)]) 75 | self.intercept_ = _get_intercept(model) 76 | 77 | 78 | def _get_coefficients(linear_model, model): 79 | """ 80 | Obtain the coefficients for the GLM regression. 81 | 82 | Raises an exception when we notice non linear parameter configurations. 83 | 84 | Parameters 85 | ---------- 86 | linear_model : PMMLGeneralizedLinearRegressor, PMMLGeneralizedLinearClassifier 87 | The PMML class representing the classifier. Should contain at least target_field, 88 | fields and field_mapping properties. 89 | 90 | model : eTree.Element 91 | The element that is assumed to contains a 92 | and element. 93 | 94 | Returns 95 | ------- 96 | coefficients: numpy.ndarray 97 | Coefficient value for every field. Zero if not present. 
98 | 99 | """ 100 | pp = model.find('PPMatrix') 101 | params = model.find('ParamMatrix') 102 | 103 | def coefficient_for_parameter(p): 104 | if not p: 105 | return 0 106 | 107 | pcells = params.findall(f"PCell[@parameterName='{p}']") 108 | if len(pcells) > 1: 109 | raise Exception('This model does not support multiple outputs.') 110 | 111 | if not pcells: 112 | return 0 113 | 114 | return float(pcells[0].get('beta')) 115 | 116 | def parameter_for_category(cells, category): 117 | cell = [cell for cell in cells if cell.get('value') == category] 118 | 119 | if not cell: 120 | return None 121 | 122 | return cell[0].get('parameterName') 123 | 124 | def coefficients_for_field(name, field): 125 | pp_cells = pp.findall(f"PPCell[@predictorName='{name}']") 126 | 127 | if not pp_cells: 128 | return [0] 129 | 130 | if field.get('optype') != 'categorical': 131 | if len(pp_cells) > 1: 132 | raise Exception('PMML model is not linear.') 133 | 134 | return [coefficient_for_parameter(pp_cells[0].get('parameterName'))] 135 | 136 | return [ 137 | coefficient_for_parameter(parameter_for_category(pp_cells, c)) 138 | for c in linear_model.field_mapping[name][1].categories 139 | ] 140 | 141 | target = linear_model.target_field.get('name') 142 | fields = {name: field for name, field in linear_model.fields.items() if name != target} 143 | 144 | return list(chain.from_iterable([ 145 | coefficients_for_field(name, field) 146 | for name, field in fields.items() 147 | 148 | ])) 149 | 150 | 151 | def _get_intercept(model): 152 | """ 153 | Find all parameters that are not included in the . 154 | 155 | These constitute the intercept. In the very unlikely case there are multiple 156 | parameters fitting this criteria, we sum the result. 157 | 158 | Parameters 159 | ---------- 160 | model : eTree.Element 161 | The element that is assumed to contains a 162 | and element. 163 | 164 | Returns 165 | ------- 166 | intercept : float 167 | Value of the intercept of the method. 
168 | 169 | """ 170 | pp = model.find('PPMatrix') 171 | params = model.find('ParamMatrix') 172 | 173 | specified = [p.get('parameterName') for p in pp.findall('PPCell')] 174 | used = [p.get('parameterName') for p in params.findall('PCell')] 175 | 176 | intercepts = set(used) - set(specified) 177 | intercepts = list(chain.from_iterable([ 178 | params.findall(f"PCell[@parameterName='{p}']") 179 | for p in intercepts 180 | ])) 181 | 182 | return sum([float(i.get('beta')) for i in intercepts]) 183 | -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.naive_bayes 2 | 3 | This package contains the `PMMLGaussianNB` classifier. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | import pandas as pd 10 | from sklearn_pmml_model.naive_bayes import PMMLGaussianNB 11 | 12 | # Prepare data 13 | df = pd.read_csv('models/categorical-test.csv') 14 | Xte = df.iloc[:, 1:] 15 | 16 | clf = PMMLGaussianNB(pmml="models/nb-cat-pima.pmml") 17 | clf.predict(Xte) 18 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn_pmml_model.naive_bayes` module implements Naive Bayes 3 | algorithms. These are supervised learning methods based on applying Bayes' 4 | theorem with strong (naive) feature independence assumptions. 
5 | """ 6 | 7 | # License: BSD 2-Clause 8 | 9 | from .implementations import PMMLGaussianNB 10 | 11 | __all__ = ['PMMLGaussianNB'] 12 | -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/implementations.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn_pmml_model.base import PMMLBaseClassifier, OneHotEncodingMixin 4 | from sklearn.naive_bayes import GaussianNB 5 | import numpy as np 6 | from itertools import chain 7 | 8 | 9 | class PMMLGaussianNB(OneHotEncodingMixin, PMMLBaseClassifier, GaussianNB): 10 | """ 11 | Gaussian Naive Bayes classifier. 12 | 13 | Can perform online updates to model parameters via :meth:`partial_fit`. 14 | For details on algorithm used to update feature means and variance online, 15 | see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque: 16 | 17 | http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf 18 | 19 | Parameters 20 | ---------- 21 | pmml : str, object 22 | Filename or file object containing PMML data. 
23 | 24 | Notes 25 | ----- 26 | Specification: http://dmg.org/pmml/v4-3/NaiveBayes.html 27 | 28 | """ 29 | 30 | def __init__(self, pmml): 31 | PMMLBaseClassifier.__init__(self, pmml) 32 | OneHotEncodingMixin.__init__(self) 33 | 34 | model = self.root.find('NaiveBayesModel') 35 | 36 | if model is None: 37 | raise Exception('PMML model does not contain NaiveBayesModel.') 38 | 39 | inputs = model.find('BayesInputs') 40 | 41 | target_values = { 42 | target: self._get_target_values(inputs, target) 43 | for target in self.classes_ 44 | } 45 | 46 | try: 47 | outputs = model.find('BayesOutput').find('TargetValueCounts').findall('TargetValueCount') 48 | counts = [int(x.get('count')) for x in outputs] 49 | self.class_prior_ = np.array([x / np.sum(counts) for x in counts]) 50 | except AttributeError: 51 | self.class_prior_ = np.array([1 / len(self.classes_) for _ in self.classes_]) 52 | 53 | self.theta_ = np.array([ 54 | [float(value.get('mean', 0)) for value in target_values[target]] 55 | for target in self.classes_ 56 | ]) 57 | try: 58 | self.sigma_ = np.array([ 59 | [float(value.get('variance', 0)) for value in target_values[target]] 60 | for target in self.classes_ 61 | ]) 62 | finally: 63 | pass 64 | 65 | try: 66 | self.var_ = np.array([ 67 | [float(value.get('variance', 0)) for value in target_values[target]] 68 | for target in self.classes_ 69 | ]) 70 | finally: 71 | pass 72 | 73 | def _get_target_values(self, inputs, target): 74 | def target_value_for_category(bayesInput, category): 75 | counts = bayesInput.find(f"PairCounts[@value='{category}']") 76 | target_counts = counts.find('TargetValueCounts') 77 | return target_counts.find(f"TargetValueCount[@value='{target}']") 78 | 79 | def target_value_for_field(name, field): 80 | bayesInput = inputs.find(f"BayesInput[@fieldName='{name}']") 81 | 82 | if field.get('optype') != 'categorical': 83 | stats = bayesInput.find('TargetValueStats') 84 | targetValue = stats.find(f"TargetValueStat[@value='{target}']") 85 | distribution 
  def _get_target_values(self, inputs, target):
    """Collect per-feature Gaussian parameters for one target class.

    Returns one element per (expanded) feature: either a
    <GaussianDistribution> element (continuous fields) or a dict with
    'mean'/'variance' keys (categorical fields).
    """
    def target_value_for_category(bayesInput, category):
      # Look up the count of `target` for one category level of this input.
      counts = bayesInput.find(f"PairCounts[@value='{category}']")
      target_counts = counts.find('TargetValueCounts')
      return target_counts.find(f"TargetValueCount[@value='{target}']")

    def target_value_for_field(name, field):
      bayesInput = inputs.find(f"BayesInput[@fieldName='{name}']")

      if field.get('optype') != 'categorical':
        stats = bayesInput.find('TargetValueStats')
        targetValue = stats.find(f"TargetValueStat[@value='{target}']")
        distribution = targetValue.find('GaussianDistribution')

        if distribution is None:
          # Any non-Gaussian distribution cannot be mapped onto GaussianNB.
          distributionName = targetValue.find('*').tag
          raise NotImplementedError(f'Distribution "{distributionName}" not implemented, or not supported '
                                    f'by scikit-learn')

        return [distribution]
      else:
        counts = [
          float(target_value_for_category(bayesInput, c).get('count'))
          for c in self.field_mapping[name][1].categories
        ]
        # Categorical fields are emulated as one pseudo-Gaussian per
        # category: the mean is the category's relative frequency, and the
        # huge variance presumably flattens the Gaussian so the likelihood
        # is dominated by the mean — TODO confirm this approximation.
        return [
          {
            'mean': count / np.sum(counts),
            'variance': 999999999
          }
          for count in counts
        ]

    return list(chain.from_iterable([
      target_value_for_field(name, field)
      for name, field in self.fields.items()
      if field is not self.target_field
    ]))
"""
The :mod:`sklearn_pmml_model.neighbors` module implements the k-nearest
neighbors algorithm.
"""

# License: BSD 2-Clause

from ._classes import PMMLKNeighborsClassifier, PMMLKNeighborsRegressor

__all__ = ['PMMLKNeighborsClassifier', 'PMMLKNeighborsRegressor']
  def __init__(self, leaf_size=30):
    """Parse the PMML NearestNeighborModel: metric, k, and training data."""
    knn_model = self.root.find('NearestNeighborModel')

    if knn_model is None:
      raise Exception('PMML model does not contain NearestNeighborModel.')

    self.n_neighbors = int(knn_model.get('numberOfNeighbors'))
    self.algorithm = 'auto'
    self.leaf_size = leaf_size
    self.p = 2  # default Minkowski power; overwritten below if specified
    self.metric_params = None
    self.outputs_2d_ = False

    # Set metric and parameters
    measure_element = knn_model.find('ComparisonMeasure')

    if measure_element is None:
      raise Exception('PMML model does not contain ComparisonMeasure.')

    # The distance metric is encoded as the tag of the first child element.
    measure = next(x for x in measure_element)

    # PMML measure name -> scikit-learn metric name.
    measures = {
      'euclidean': 'euclidean',
      'chebychev': 'chebyshev',
      'cityBlock': 'manhattan',
      'minkowski': 'minkowski',
      'simpleMatching': 'matching',
      'jaccard': 'jaccard',
      'tanimoto': 'rogerstanimoto',
    }

    if measure.tag not in measures:
      raise Exception(f'PMML model uses unsupported distance metric: "{measure.tag}".')

    self.metric = measures[measure.tag]

    if self.metric == 'minkowski':
      self.p = float(measure.get('p-parameter'))
      self.metric_params = {'p': self.p}

    self._check_algorithm_metric()

    # Set training instances: k-NN stores its "fit" data inside the PMML
    # as an InlineTable.
    instances = knn_model.find('TrainingInstances')

    fields_element = instances.find('InstanceFields')
    # Field name -> column tag suffix (the column attribute may carry a
    # namespace prefix such as "data:age", hence the split on ':').
    mapping = {x.get('field'): x.get('column').split(':')[-1] for x in fields_element}
    target = self.target_field.get('name')
    fields = [x.get('field') for x in fields_element if x.get('field') != target]

    # field_mapping[f][1] is the type-conversion callable for field f.
    data = [
      [
        self.field_mapping[f][1](next(x for x in row if x.tag.endswith(mapping[f])).text)
        for f in fields
      ]
      for row in instances.find('InlineTable')
    ]

    self._X = pd.DataFrame(data, columns=fields)
    self._y = np.array([
      self.field_mapping[target][1](next(x for x in row if x.tag.endswith(mapping[target])).text)
      for row in instances.find('InlineTable')
    ])
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML and fit on the training instances embedded in it."""
    PMMLBaseClassifier.__init__(self, pmml)
    KNeighborsClassifier.__init__(self, n_jobs=n_jobs)
    # PMMLBaseKNN extracts n_neighbors, the metric, and the training table
    # (self._X / self._y) from the PMML document.
    PMMLBaseKNN.__init__(self)

    # k-NN is lazy: "fitting" just indexes the stored training instances.
    KNeighborsClassifier.fit(self, self._X, self._y)
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML and fit on the training instances embedded in it."""
    PMMLBaseRegressor.__init__(self, pmml)
    KNeighborsRegressor.__init__(self, n_jobs=n_jobs)
    # PMMLBaseKNN extracts n_neighbors, the metric, and the training table
    # (self._X / self._y) from the PMML document.
    PMMLBaseKNN.__init__(self)

    # k-NN is lazy: "fitting" just indexes the stored training instances.
    KNeighborsRegressor.fit(self, self._X, self._y)
class PMMLBaseNeuralNetwork:
  """
  Abstract class for Neural Network models.

  The PMML model consists out of a <NeuralNetwork> element, containing a
  <NeuralInputs> element that describes the input layer neurons with
  <NeuralInput> elements. Next, <NeuralLayer> elements describe all other
  neurons with associated weights and biases. The activation function is
  either specified globally with the activationFunction attribute on the
  <NeuralNetwork> element, or the same attribute on each layer. Note however
  that scikit-learn only supports a single activation function for all hidden
  layers. Finally, the <NeuralOutputs> element describes the output layer.
  The output is currently expected to match the target field in the mining
  schema.

  Notes
  -----
  Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html

  """

  def __init__(self):
    nn_model = self.root.find('NeuralNetwork')

    if nn_model is None:
      raise Exception('PMML model does not contain NeuralNetwork.')

    inputs = nn_model.find('NeuralInputs')

    if inputs is None:
      raise Exception('PMML model does not contain NeuralInputs.')

    # Map data field name -> neural input neuron id.
    mapping = {
      x.find('DerivedField').find('FieldRef').get('field'): x.get('id')
      for x in inputs.findall('NeuralInput')
    }

    # Every DataField (except the target) must map 1:1 onto an input neuron;
    # anything else implies preprocessing inside the PMML, which we reject.
    target = self.target_field.get('name')
    fields = [name for name, field in self.fields.items() if name != target and field.tag == 'DataField']
    if set(mapping.keys()) != set(fields):
      raise Exception('PMML model preprocesses the data which currently unsupported.')

    layers = [layer for layer in nn_model.findall('NeuralLayer')]
    if isinstance(self, PMMLBaseClassifier) and len(self.classes_) == 2:
      # For binary classifiers, drop everything after the first layer with an
      # 'identity' activation — presumably the exporter appends extra output
      # normalization layers there that MLPClassifier handles itself.
      # TODO confirm against exporter output.
      index = next((i + 1 for i, layer in enumerate(layers) if layer.get('activationFunction') == 'identity'), None)
      layers = layers[:index]

    if len(layers) == 0:
      raise Exception('PMML model does not contain any NeuralLayer elements.')

    self.n_layers_ = len(layers) + 1  # +1 for input layer

    neurons = [layer.findall('Neuron') for layer in layers]
    # The last layer is the output layer, hence [:-1].
    self.hidden_layer_sizes = [len(neuron) for neuron in neurons][:-1]

    # Determine activation function (PMML name -> scikit-learn name).
    activation_functions = {
      'logistic': 'logistic',
      'tanh': 'tanh',
      'identity': 'identity',
      'rectifier': 'relu'
    }
    activation_function = nn_model.get('activationFunction')

    if activation_function is None:
      # No global attribute: fall back to the first layer's attribute.
      activation_function = layers[0].get('activationFunction')

    layer_activations = [
      layer.get('activationFunction')
      for layer in layers[:-1]
      if layer.get('activationFunction') is not None
    ]

    # scikit-learn supports only one activation for all hidden layers.
    if len(np.unique([activation_function] + layer_activations)) > 1:
      raise Exception('Neural networks with different activation functions per '
                      'layer are not currently supported by scikit-learn.')

    if activation_function not in activation_functions:
      raise Exception('PMML model uses unsupported activationFunction.')

    self.activation = activation_functions[activation_function]

    # Set neuron weights: one (fan_in, fan_out) matrix per layer, where the
    # first layer's fan-in is the number of input neurons.
    sizes = list(zip(
      [len(mapping)] + [len(layer) for layer in layers][:-1],
      [len(layer) for layer in layers]
    ))

    self.coefs_ = [np.zeros(shape=s) for s in sizes]
    self.intercepts_ = [
      np.array([float(neuron.get('bias', 0)) for neuron in layer])
      for layer in neurons
    ]

    # Fill the weight matrices from each neuron's <Con> connections, indexed
    # by the id of the source neuron in the previous layer.
    field_ids = [mapping[field] for field in fields]
    for li, layer in enumerate(neurons):
      if li == 0:
        layer_ids = field_ids
      else:
        layer_ids = [x.get('id') for x in neurons[li - 1]]
      for ni, neuron in enumerate(layer):
        for connection in neuron.findall('Con'):
          ci = layer_ids.index(connection.get('from'))
          self.coefs_[li][ci, ni] = float(connection.get('weight'))
15 | 16 | Parameters 17 | ---------- 18 | pmml : str, object 19 | Filename or file object containing PMML data. 20 | 21 | Notes 22 | ----- 23 | Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html 24 | 25 | """ 26 | 27 | def __init__(self, pmml): 28 | PMMLBaseClassifier.__init__(self, pmml) 29 | MLPClassifier.__init__(self) 30 | PMMLBaseNeuralNetwork.__init__(self) 31 | 32 | if len(self.classes_) == 2: 33 | self.out_activation_ = "logistic" 34 | self.n_outputs_ = 1 35 | else: 36 | self.out_activation_ = "softmax" 37 | self.n_outputs_ = len(self.classes_) 38 | 39 | target_type: Category = get_type(self.target_field) 40 | self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) 41 | self._label_binarizer.classes_ = np.array(target_type.categories) 42 | self._label_binarizer.y_type_ = type_of_target(target_type.categories) 43 | self._label_binarizer.sparse_input_ = False 44 | 45 | def fit(self, x, y): 46 | return PMMLBaseClassifier.fit(self, x, y) 47 | 48 | def _more_tags(self): 49 | return MLPClassifier._more_tags(self) 50 | 51 | 52 | class PMMLMLPRegressor(PMMLBaseRegressor, PMMLBaseNeuralNetwork, MLPRegressor): 53 | """ 54 | Multi-layer Perceptron regressor. 55 | 56 | Parameters 57 | ---------- 58 | pmml : str, object 59 | Filename or file object containing PMML data. 
60 | 61 | Notes 62 | ----- 63 | Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html 64 | 65 | """ 66 | 67 | def __init__(self, pmml): 68 | PMMLBaseRegressor.__init__(self, pmml) 69 | MLPRegressor.__init__(self) 70 | PMMLBaseNeuralNetwork.__init__(self) 71 | 72 | self.out_activation_ = "identity" 73 | 74 | def fit(self, x, y): 75 | return PMMLBaseRegressor.fit(self, x, y) 76 | 77 | def _more_tags(self): 78 | return MLPRegressor._more_tags(self) 79 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.svm 2 | 3 | This package contains the `PMMLLinearSVC`, `PMMLNuSVC` and `PMMLSVC` classifier models, as well as the `PMMLLinearSVR`, `PMMLNuSVR` and `PMMLSVR` regression models. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | import pandas as pd 10 | from sklearn_pmml_model.svm import PMMLSVC 11 | 12 | # Prepare data 13 | df = pd.read_csv('models/categorical-test.csv') 14 | Xte = df.iloc[:, 1:] 15 | 16 | clf = PMMLSVC(pmml="models/svc-cat-pima.pmml") 17 | clf.predict(Xte) 18 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn.svm` module includes Support Vector Machine algorithms. 
3 | """ 4 | 5 | # License: BSD 2-Clause 6 | 7 | from ._classes import PMMLLinearSVC, PMMLLinearSVR, PMMLNuSVC, PMMLNuSVR, PMMLSVC, PMMLSVR 8 | 9 | __all__ = ['PMMLLinearSVC', 'PMMLLinearSVR', 'PMMLNuSVC', 'PMMLNuSVR', 'PMMLSVC', 'PMMLSVR'] 10 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/_base.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn_pmml_model.base import PMMLBaseRegressor, PMMLBaseClassifier, parse_array 4 | import numpy as np 5 | 6 | 7 | class PMMLBaseSVM: 8 | """ 9 | Abstract class for Support Vector Machines. 10 | 11 | The PMML model consists out of a element, 12 | containing a element that contains a 13 | element describing support vectors, and a element describing 14 | the coefficients for each support vector. Support vectors are referenced from 15 | a element, in which the true support vectors are described 16 | using elements. Furthermore, the model contains one out of 17 | , , or 18 | describing the kernel function used. 19 | 20 | Parameters 21 | ---------- 22 | pmml : str, object 23 | Filename or file object containing PMML data. 
24 | 25 | Notes 26 | ----- 27 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachineModel.html 28 | 29 | """ 30 | 31 | def __init__(self): 32 | # Import coefficients and intercepts 33 | model = self.root.find('SupportVectorMachineModel') 34 | 35 | if model is None: 36 | raise Exception('PMML model does not contain SupportVectorMachineModel.') 37 | 38 | vector_dictionary = model.find('VectorDictionary') 39 | svms = model.findall('SupportVectorMachine') 40 | coefficients = [svm.find('Coefficients') for svm in svms] 41 | 42 | self.shape_fit_ = (0, len(vector_dictionary.find('VectorFields'))) 43 | self.support_ = np.array([ 44 | int(x.get('id')) 45 | for x in vector_dictionary.findall('VectorInstance') 46 | ]).astype(np.int32) 47 | 48 | classes = [None, None] if isinstance(self, PMMLBaseRegressor) else self.classes_ 49 | 50 | self._n_support = np.array([ 51 | len(get_overlapping_vectors(get_alt_svms(svms, classes, c))) 52 | for c in classes 53 | ]).astype(np.int32) 54 | 55 | self.support_vectors_ = np.array([ 56 | get_vectors(vector_dictionary, s) for s in self.support_ 57 | ]) 58 | 59 | self._intercept_ = self.intercept_ = np.array([float(cs.get('absoluteValue')) for cs in coefficients]) 60 | self._dual_coef_ = self.dual_coef_ = np.array( 61 | get_coefficients(classes, self._n_support, self.support_, svms) 62 | ) 63 | 64 | if isinstance(self, PMMLBaseClassifier) and len(classes) == 2: 65 | self._n_support = (self._n_support / 2).astype(np.int32) 66 | 67 | linear = model.find('LinearKernelType') 68 | poly = model.find('PolynomialKernelType') 69 | rbf = model.find('RadialBasisKernelType') 70 | sigmoid = model.find('SigmoidKernelType') 71 | 72 | if linear is not None: 73 | self.kernel = 'linear' 74 | self._gamma = self.gamma = 0.0 75 | elif poly is not None: 76 | self.kernel = 'poly' 77 | self._gamma = self.gamma = float(poly.get('gamma')) 78 | self.coef0 = float(poly.get('coef0')) 79 | self.degree = int(poly.get('degree')) 80 | elif rbf is not None: 81 | 
self.kernel = 'rbf' 82 | self._gamma = self.gamma = float(rbf.get('gamma')) 83 | elif sigmoid is not None: 84 | self.kernel = 'sigmoid' 85 | self._gamma = self.gamma = float(sigmoid.get('gamma')) 86 | self.coef0 = float(sigmoid.get('coef0')) 87 | 88 | self._probA = np.array([]) 89 | self._probB = np.array([]) 90 | 91 | 92 | def get_vectors(vector_dictionary, s): 93 | """Return support vector values, parsed as a numpy array.""" 94 | instance = vector_dictionary.find(f"VectorInstance[@id='{s}']") 95 | 96 | if instance is None: 97 | raise Exception(f'PMML model is broken, vector instance (id = {s}) not found.') 98 | 99 | array = instance.find('Array') 100 | if array is None: 101 | array = instance.find('REAL-Array') 102 | if array is None: 103 | array = instance.find('SparseArray') 104 | if array is None: 105 | array = instance.find('REAL-SparseArray') 106 | if array is None: 107 | raise Exception(f'PMML model is broken, vector instance (id = {s}) does not contain (Sparse)Array element.') 108 | 109 | return np.array(parse_array(array)) 110 | 111 | 112 | def get_alt_svms(svms, classes, target_class): 113 | """ 114 | Find alternative SVMs (e.g., for target class 0, find the svms classifying 0 against 1, and 0 against 2). 115 | 116 | Parameters 117 | ---------- 118 | svms : list 119 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 120 | 121 | classes : numpy.array 122 | The classes to be predicted by the model. 123 | 124 | target_class : str 125 | The target class. 126 | 127 | Returns 128 | ------- 129 | alt_svms : list 130 | List of eTree.Elements filtered to only include SVMs comparing the target class against alternate classes. 
131 | 132 | """ 133 | # Noop for regression 134 | if classes[0] is None: 135 | return svms 136 | 137 | alt_svms = [ 138 | svm for svm in svms 139 | if svm.get('targetCategory') == str(target_class) or svm.get('alternateTargetCategory') == str(target_class) 140 | ] 141 | 142 | # Sort svms based on target class order 143 | alt_svms = [ 144 | next(svm for svm in alt_svms if svm.get('targetCategory') == str(c) or svm.get('alternateTargetCategory') == str(c)) 145 | for c in set(classes).difference({target_class}) 146 | ] 147 | 148 | return alt_svms 149 | 150 | 151 | def get_overlapping_vectors(svms): 152 | """ 153 | Return support vector ids that are present in all provided SVM elements. 154 | 155 | Parameters 156 | ---------- 157 | svms : list 158 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 159 | 160 | Returns 161 | ------- 162 | output : set 163 | Set containing all integer vector ids that are present in all provided SVM elements. 164 | 165 | """ 166 | support_vectors = [svm.find('SupportVectors') for svm in svms] 167 | vector_ids = [{int(x.get('vectorId')) for x in s.findall('SupportVector')} for s in support_vectors] 168 | return set.intersection(*vector_ids) 169 | 170 | 171 | def get_coefficients(classes, n_support, support_ids, svms): 172 | """ 173 | Return support vector coefficients. 174 | 175 | Parameters 176 | ---------- 177 | classes : numpy.array 178 | The classes to be predicted by the model. 179 | 180 | n_support : numpy.array 181 | Numpy array describing the number of support vectors for each class. 182 | 183 | support_ids: list 184 | A list describing the ids of all support vectors in the model. 185 | 186 | svms : list 187 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 
188 | 189 | """ 190 | dual_coef = np.zeros((len(classes) - 1, len(support_ids))) 191 | 192 | for i, x in enumerate(classes): 193 | alt_svms = get_alt_svms(svms, classes, x) 194 | offsets = [0] + np.cumsum(n_support).tolist() 195 | 196 | for j, svm in enumerate(alt_svms): 197 | start = offsets[i] 198 | end = offsets[i + 1] 199 | ids = support_ids[start:end] 200 | 201 | support_vectors = [int(x.get('vectorId')) for x in svm.find('SupportVectors').findall('SupportVector')] 202 | coefficients = [float(x.get('value')) for x in svm.find('Coefficients').findall('Coefficient')] 203 | indices = [support_vectors.index(x) for x in ids] 204 | dual_coef[j, start:end] = np.array(coefficients)[indices] 205 | 206 | return dual_coef 207 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/_classes.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn.svm import LinearSVC, LinearSVR, NuSVC, NuSVR, SVC, SVR 4 | import numpy as np 5 | from scipy.sparse import isspmatrix 6 | from sklearn_pmml_model.base import OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseRegressor 7 | from sklearn_pmml_model.svm._base import PMMLBaseSVM 8 | from sklearn_pmml_model.linear_model.implementations import _get_coefficients as _linear_get_coefficients 9 | 10 | 11 | class PMMLLinearSVC(OneHotEncodingMixin, PMMLBaseClassifier, LinearSVC): 12 | """ 13 | Linear Support Vector Classification. 14 | 15 | Similar to SVC with parameter kernel='linear', but implemented in terms of 16 | liblinear rather than libsvm, so it has more flexibility in the choice of 17 | penalties and loss functions and should scale better to large numbers of 18 | samples. 19 | 20 | This class supports both dense and sparse input and the multiclass support 21 | is handled according to a one-vs-the-rest scheme. 22 | 23 | The PMML model is assumed to be equivalent to PMMLLogisticRegression. 
24 | 25 | Parameters 26 | ---------- 27 | pmml : str, object 28 | Filename or file object containing PMML data. 29 | 30 | Notes 31 | ----- 32 | Specification: http://dmg.org/pmml/v4-3/Regression.html 33 | 34 | """ 35 | 36 | def __init__(self, pmml): 37 | PMMLBaseClassifier.__init__(self, pmml) 38 | OneHotEncodingMixin.__init__(self) 39 | LinearSVC.__init__(self) 40 | 41 | # Import coefficients and intercepts 42 | model = self.root.find('RegressionModel') 43 | 44 | if model is None: 45 | raise Exception('PMML model does not contain RegressionModel.') 46 | 47 | tables = [ 48 | table for table in model.findall('RegressionTable') 49 | if table.find('NumericPredictor') is not None 50 | ] 51 | 52 | self.coef_ = [ 53 | _linear_get_coefficients(self, table) 54 | for table in tables 55 | ] 56 | self.intercept_ = [ 57 | float(table.get('intercept')) 58 | for table in tables 59 | ] 60 | 61 | if len(self.coef_) == 1: 62 | self.coef_ = [self.coef_[0]] 63 | 64 | if len(self.intercept_) == 1: 65 | self.intercept_ = [self.intercept_[0]] 66 | 67 | self.coef_ = np.array(self.coef_) 68 | self.intercept_ = np.array(self.intercept_) 69 | 70 | def fit(self, x, y): 71 | return PMMLBaseClassifier.fit(self, x, y) 72 | 73 | def _more_tags(self): 74 | return LinearSVC._more_tags(self) 75 | 76 | 77 | class PMMLLinearSVR(OneHotEncodingMixin, PMMLBaseRegressor, LinearSVR): 78 | """ 79 | Linear Support Vector Regression. 80 | 81 | Similar to SVR with parameter kernel='linear', but implemented in terms of 82 | liblinear rather than libsvm, so it has more flexibility in the choice of 83 | penalties and loss functions and should scale better to large numbers of 84 | samples. 85 | 86 | This class supports both dense and sparse input. 87 | 88 | The PMML model is assumed to be equivalent to PMMLLinearRegression. 89 | 90 | Parameters 91 | ---------- 92 | pmml : str, object 93 | Filename or file object containing PMML data. 
94 | 95 | Notes 96 | ----- 97 | Specification: http://dmg.org/pmml/v4-3/Regression.html 98 | 99 | """ 100 | 101 | def __init__(self, pmml): 102 | PMMLBaseRegressor.__init__(self, pmml) 103 | OneHotEncodingMixin.__init__(self) 104 | 105 | # Import coefficients and intercepts 106 | model = self.root.find('RegressionModel') 107 | 108 | if model is None: 109 | raise Exception('PMML model does not contain RegressionModel.') 110 | 111 | tables = model.findall('RegressionTable') 112 | 113 | self.coef_ = np.array([ 114 | _linear_get_coefficients(self, table) 115 | for table in tables 116 | ]) 117 | self.intercept_ = np.array([ 118 | float(table.get('intercept')) 119 | for table in tables 120 | ]) 121 | 122 | if self.coef_.shape[0] == 1: 123 | self.coef_ = self.coef_[0] 124 | 125 | if self.intercept_.shape[0] == 1: 126 | self.intercept_ = self.intercept_[0] 127 | 128 | def fit(self, x, y): 129 | return PMMLBaseRegressor.fit(self, x, y) 130 | 131 | def _more_tags(self): 132 | return LinearSVR._more_tags(self) 133 | 134 | 135 | class PMMLNuSVC(OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseSVM, NuSVC): 136 | """ 137 | Nu-Support Vector Classification. 138 | 139 | Similar to SVC but uses a parameter to control the number of support 140 | vectors. 141 | 142 | The implementation is based on libsvm. 143 | 144 | Parameters 145 | ---------- 146 | pmml : str, object 147 | Filename or file object containing PMML data. 
148 | 149 | Notes 150 | ----- 151 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 152 | 153 | """ 154 | 155 | def __init__(self, pmml): 156 | PMMLBaseClassifier.__init__(self, pmml) 157 | OneHotEncodingMixin.__init__(self) 158 | NuSVC.__init__(self) 159 | PMMLBaseSVM.__init__(self) 160 | 161 | def _prepare_data(self, X): 162 | self._sparse = isspmatrix(X) 163 | return super()._prepare_data(X) 164 | 165 | def decision_function(self, X, *args, **kwargs): 166 | X = self._prepare_data(X) 167 | return super().decision_function(X, *args, **kwargs) 168 | 169 | def fit(self, x, y): 170 | return PMMLBaseClassifier.fit(self, x, y) 171 | 172 | def _more_tags(self): 173 | return NuSVC._more_tags(self) 174 | 175 | 176 | class PMMLNuSVR(OneHotEncodingMixin, PMMLBaseRegressor, PMMLBaseSVM, NuSVR): 177 | """ 178 | Nu Support Vector Regression. 179 | 180 | Similar to NuSVC, for regression, uses a parameter nu to control 181 | the number of support vectors. However, unlike NuSVC, where nu 182 | replaces C, here nu replaces the parameter epsilon of epsilon-SVR. 183 | 184 | The implementation is based on libsvm. 185 | 186 | Parameters 187 | ---------- 188 | pmml : str, object 189 | Filename or file object containing PMML data. 190 | 191 | Notes 192 | ----- 193 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 194 | 195 | """ 196 | 197 | def __init__(self, pmml): 198 | PMMLBaseRegressor.__init__(self, pmml) 199 | OneHotEncodingMixin.__init__(self) 200 | NuSVR.__init__(self) 201 | PMMLBaseSVM.__init__(self) 202 | 203 | def _prepare_data(self, X): 204 | self._sparse = isspmatrix(X) 205 | return super()._prepare_data(X) 206 | 207 | def fit(self, x, y): 208 | return PMMLBaseRegressor.fit(self, x, y) 209 | 210 | def _more_tags(self): 211 | return NuSVR._more_tags(self) 212 | 213 | 214 | class PMMLSVC(OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseSVM, SVC): 215 | """ 216 | C-Support Vector Classification. 
217 | 218 | The implementation is based on libsvm. The multiclass support is 219 | handled according to a one-vs-one scheme. 220 | 221 | For details on the precise mathematical formulation of the provided 222 | kernel functions and how `gamma`, `coef0` and `degree` affect each 223 | other, see the corresponding section in the narrative documentation: 224 | `Kernel functions `_. 225 | 226 | Parameters 227 | ---------- 228 | pmml : str, object 229 | Filename or file object containing PMML data. 230 | 231 | Notes 232 | ----- 233 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 234 | 235 | """ 236 | 237 | def __init__(self, pmml): 238 | PMMLBaseClassifier.__init__(self, pmml) 239 | OneHotEncodingMixin.__init__(self) 240 | SVC.__init__(self) 241 | PMMLBaseSVM.__init__(self) 242 | 243 | def _prepare_data(self, X): 244 | self._sparse = isspmatrix(X) 245 | return super()._prepare_data(X) 246 | 247 | def decision_function(self, X, *args, **kwargs): 248 | X = self._prepare_data(X) 249 | return super().decision_function(X, *args, **kwargs) 250 | 251 | def fit(self, x, y): 252 | return PMMLBaseClassifier.fit(self, x, y) 253 | 254 | def _more_tags(self): 255 | return SVC._more_tags(self) 256 | 257 | 258 | class PMMLSVR(OneHotEncodingMixin, PMMLBaseRegressor, PMMLBaseSVM, SVR): 259 | """ 260 | Epsilon-Support Vector Regression. 261 | 262 | The free parameters in the model are C and epsilon. The implementation 263 | is based on libsvm. 264 | 265 | For details on the precise mathematical formulation of the provided 266 | kernel functions and how `gamma`, `coef0` and `degree` affect each 267 | other, see the corresponding section in the narrative documentation: 268 | `Kernel functions `_. 269 | 270 | Parameters 271 | ---------- 272 | pmml : str, object 273 | Filename or file object containing PMML data. 
274 | 275 | Notes 276 | ----- 277 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 278 | 279 | """ 280 | 281 | def __init__(self, pmml): 282 | PMMLBaseRegressor.__init__(self, pmml) 283 | OneHotEncodingMixin.__init__(self) 284 | SVR.__init__(self) 285 | PMMLBaseSVM.__init__(self) 286 | 287 | def _prepare_data(self, X): 288 | self._sparse = isspmatrix(X) 289 | return super()._prepare_data(X) 290 | 291 | def fit(self, x, y): 292 | return PMMLBaseRegressor.fit(self, x, y) 293 | 294 | def _more_tags(self): 295 | return SVR._more_tags(self) 296 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.tree 2 | 3 | This package contains the `PMMLTreeClassifier`. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | from sklearn.datasets import load_iris 10 | from sklearn.model_selection import train_test_split 11 | import pandas as pd 12 | import numpy as np 13 | from sklearn_pmml_model.tree import PMMLTreeClassifier 14 | 15 | # Prepare data 16 | iris = load_iris() 17 | X = pd.DataFrame(iris.data) 18 | X.columns = np.array(iris.feature_names) 19 | y = pd.Series(np.array(iris.target_names)[iris.target]) 20 | y.name = "Class" 21 | Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33, random_state=123) 22 | 23 | clf = PMMLTreeClassifier(pmml="models/tree-iris.pmml") 24 | clf.predict(Xte) 25 | clf.score(Xte, yte) 26 | ``` 27 | 28 | To interpret the resulting tree, including categorical spits, we adapted the example from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html: 29 | 30 | ```python 31 | node_indicator = clf.decision_path(X) 32 | leaf_id = clf.apply(X) 33 | 34 | sample_id = 0 35 | # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id` 36 | node_index = 
node_indicator.indices[node_indicator.indptr[sample_id]: 37 | node_indicator.indptr[sample_id + 1]] 38 | 39 | print('Rules used to predict sample {id}:\n'.format(id=sample_id)) 40 | for node_id in node_index: 41 | # continue to the next node if it is a leaf node 42 | if leaf_id[sample_id] == node_id: 43 | continue 44 | 45 | # check if value of the split feature for sample 0 is below threshold 46 | if isinstance(clf.tree_.threshold[node_id], list): 47 | threshold_sign = "in" 48 | elif (X.iloc[sample_id, clf.tree_.feature[node_id]] <= clf.tree_.threshold[node_id]): 49 | threshold_sign = "<=" 50 | else: 51 | threshold_sign = ">" 52 | 53 | print("decision node {node} : (X[{sample}, {feature}] = {value}) " 54 | "{inequality} {threshold})".format( 55 | node=node_id, 56 | sample=sample_id, 57 | feature=clf.tree_.feature[node_id], 58 | value=X.iloc[sample_id, clf.tree_.feature[node_id]], 59 | inequality=threshold_sign, 60 | threshold=str(clf.tree_.threshold[node_id]))) 61 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn_pmml_model.tree` module includes decision tree-based models for 3 | classification and regression. 4 | """ 5 | 6 | # License: BSD 2-Clause 7 | 8 | from .tree import PMMLTreeClassifier, PMMLTreeRegressor, get_tree, clone 9 | 10 | __all__ = ['PMMLTreeClassifier', 'PMMLTreeRegressor', 'get_tree', 'clone'] 11 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_criterion.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | # Authors: Gilles Louppe 4 | # Peter Prettenhofer 5 | # Brian Holt 6 | # Joel Nothman 7 | # Arnaud Joly 8 | # Jacob Schreiber 9 | # 10 | # License: BSD 3 clause 11 | 12 | # See _criterion.pyx for implementation details. 
13 | 14 | import numpy as np 15 | cimport numpy as np 16 | 17 | ctypedef np.npy_float32 DTYPE_t # Type of X 18 | ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight 19 | ctypedef np.npy_intp SIZE_t # Type for indices and counters 20 | ctypedef np.npy_int32 INT32_t # Signed 32 bit integer 21 | ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer 22 | 23 | cdef class Criterion: 24 | # The criterion computes the impurity of a node and the reduction of 25 | # impurity of a split on that node. It also computes the output statistics 26 | # such as the mean in regression and class probabilities in classification. 27 | 28 | # Internal structures 29 | cdef DOUBLE_t* y # Values of y 30 | cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1) 31 | cdef DOUBLE_t* sample_weight # Sample weights 32 | 33 | cdef SIZE_t* samples # Sample indices in X, y 34 | cdef SIZE_t start # samples[start:pos] are the samples in the left node 35 | cdef SIZE_t pos # samples[pos:end] are the samples in the right node 36 | cdef SIZE_t end 37 | 38 | cdef SIZE_t n_outputs # Number of outputs 39 | cdef SIZE_t n_samples # Number of samples 40 | cdef SIZE_t n_node_samples # Number of samples in the node (end-start) 41 | cdef double weighted_n_samples # Weighted number of samples (in total) 42 | cdef double weighted_n_node_samples # Weighted number of samples in the node 43 | cdef double weighted_n_left # Weighted number of samples in the left node 44 | cdef double weighted_n_right # Weighted number of samples in the right node 45 | 46 | cdef double* sum_total # For classification criteria, the sum of the 47 | # weighted count of each label. For regression, 48 | # the sum of w*y. sum_total[k] is equal to 49 | # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], 50 | # where k is output index. 
51 | cdef double* sum_left # Same as above, but for the left side of the split 52 | cdef double* sum_right # same as above, but for the right side of the split 53 | 54 | # The criterion object is maintained such that left and right collected 55 | # statistics correspond to samples[start:pos] and samples[pos:end]. 56 | 57 | # Methods 58 | cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, 59 | double weighted_n_samples, SIZE_t* samples, SIZE_t start, 60 | SIZE_t end) except -1 nogil 61 | cdef int reset(self) except -1 nogil 62 | cdef int reverse_reset(self) except -1 nogil 63 | cdef int update(self, SIZE_t new_pos) except -1 nogil 64 | cdef double node_impurity(self) nogil 65 | cdef void children_impurity(self, double* impurity_left, 66 | double* impurity_right) nogil 67 | cdef void node_value(self, double* dest) nogil 68 | cdef double impurity_improvement(self, double impurity) nogil 69 | cdef double proxy_impurity_improvement(self) nogil 70 | 71 | cdef class ClassificationCriterion(Criterion): 72 | """Abstract criterion for classification.""" 73 | 74 | cdef SIZE_t* n_classes 75 | cdef SIZE_t sum_stride 76 | 77 | cdef class RegressionCriterion(Criterion): 78 | """Abstract regression criterion.""" 79 | 80 | cdef double sq_sum_total 81 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_splitter.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | # Authors: Gilles Louppe 4 | # Peter Prettenhofer 5 | # Brian Holt 6 | # Joel Nothman 7 | # Arnaud Joly 8 | # Jacob Schreiber 9 | # 10 | # License: BSD 3 clause 11 | 12 | # See _splitter.pyx for details. 
# Declarations for the node splitter; the implementation lives in _splitter.pyx.

import numpy as np
cimport numpy as np

from ._utils cimport SplitValue, SplitRecord

from ._criterion cimport Criterion

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
ctypedef np.npy_uint64 UINT64_t          # Unsigned 64 bit integer

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.

    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf

    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t* features                # Feature indices in X
    cdef SIZE_t* constant_features       # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t* feature_values         # temp. array holding feature values

    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node

    cdef bint presort                    # Whether to use presorting, only
                                         # allowed on dense data
    cdef bint breiman_shortcut           # Whether decision trees are allowed to use the
                                         # Breiman shortcut for categorical features

    cdef DOUBLE_t* y
    cdef SIZE_t y_stride
    cdef DOUBLE_t* sample_weight
    cdef INT32_t *n_categories           # (n_features,) array giving number of
                                         # categories (<0 for non-categorical)
    cdef UINT32_t* cat_cache             # Cache buffer for fast categorical split evaluation

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.

    # The 1-d `features` array of size n_features contains the features
    # indices and allows fast sampling without replacement of features.

    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons
    # This allows optimization with depth-based tree building.

    # Methods
    cdef int init(self, object X, np.ndarray y,
                  DOUBLE_t* sample_weight,
                  INT32_t* n_categories,
                  np.ndarray X_idx_sorted=*) except -1

    cdef int node_reset(self, SIZE_t start, SIZE_t end,
                        double* weighted_n_node_samples) except -1 nogil

    cdef int node_split(self,
                        double impurity,   # Impurity of the node
                        SplitRecord* split,
                        SIZE_t* n_constant_features) except -1 nogil

    cdef void node_value(self, double* dest) nogil

    cdef double node_impurity(self) nogil
-------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_tree.pxd: --------------------------------------------------------------------------------
# cython: language_level=3

# Authors: Gilles Louppe
#          Peter Prettenhofer
#          Brian Holt
#          Joel Nothman
#          Arnaud Joly
#          Jacob Schreiber
#          Nelson Liu
#
# License: BSD 3 clause

# See _tree.pyx for details.

import numpy as np
cimport numpy as np

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer

from ._utils cimport SplitValue
from ._utils cimport SplitRecord
from ._utils cimport Node
from ._splitter cimport Splitter


cdef class CategoryCacheMgr:
    # Class to manage the category cache memory during Tree.apply()

    cdef SIZE_t n_nodes
    cdef UINT32_t **bits

    cdef void populate(self, Node *nodes, SIZE_t n_nodes, INT32_t *n_categories)


cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.
    # Input/Output layout
    cdef public SIZE_t n_features        # Number of features in X
    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
    cdef public SIZE_t n_outputs         # Number of outputs in y
    cdef public SIZE_t max_n_classes     # max(n_classes)

    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t node_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
    cdef SIZE_t value_stride             # = n_outputs * max_n_classes
    cdef INT32_t *n_categories           # (n_features,) array giving number of
                                         # categories (<0 for non-categorical)

    # Methods
    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, SplitValue split_value, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_samples) except -1 nogil
    cdef int _resize(self, SIZE_t capacity) except -1 nogil
    cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil

    cdef np.ndarray _get_value_ndarray(self)
    cdef np.ndarray _get_node_ndarray(self)

    cpdef np.ndarray predict(self, object X)

    cpdef np.ndarray apply(self, object X)
    cdef np.ndarray _apply_dense(self, object X)
    cdef np.ndarray _apply_sparse_csr(self, object X)

    cpdef object decision_path(self, object X)
    cdef object _decision_path_dense(self, object X)
    cdef object _decision_path_sparse_csr(self, object X)

    cpdef compute_feature_importances(self, normalize=*)


# =============================================================================
# Tree builder
# =============================================================================

cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.

    cdef Splitter splitter               # Splitting algorithm

    cdef SIZE_t min_samples_split        # Minimum number of samples in an internal node
    cdef SIZE_t min_samples_leaf         # Minimum number of samples in a leaf
    cdef double min_weight_leaf          # Minimum weight in a leaf
    cdef SIZE_t max_depth                # Maximal tree depth
    cdef double min_impurity_split
    cdef double min_impurity_decrease    # Impurity threshold for early stopping

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=*,
                np.ndarray n_categories=*,
                np.ndarray X_idx_sorted=*)
    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
-------------------------------------------------------------------------------- /sklearn_pmml_model/tree/quad_tree.pxd: --------------------------------------------------------------------------------
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# Author: Thomas Moreau <thomas.moreau.2010@gmail.com>
# Author: Olivier Grisel <olivier.grisel@ensta.fr>

# See quad_tree.pyx for details.

import numpy as np
cimport numpy as np

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer

# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
    DEBUGFLAG = 0

cdef float EPSILON = 1e-6

# XXX: Careful to not change the order of the arguments.
# It is important to
# have is_leaf and max_width consecutive as it permits to avoid padding by
# the compiler and keep the size coherent for both C and numpy data structures.
cdef struct Cell:
    # Base storage structure for cells in a QuadTree object

    # Tree structure
    SIZE_t parent                        # Parent cell of this cell
    SIZE_t[8] children                   # Array pointing to childrens of this cell

    # Cell description
    SIZE_t cell_id                       # Id of the cell in the cells array in the Tree
    SIZE_t point_index                   # Index of the point at this cell (only defined
                                         # in non empty leaf)
    bint is_leaf                         # Does this cell have children?
    DTYPE_t squared_max_width            # Squared value of the maximum width w
    SIZE_t depth                         # Depth of the cell in the tree
    SIZE_t cumulative_size               # Number of points included in the subtree with
                                         # this cell as a root.

    # Internal constants
    DTYPE_t[3] center                    # Store the center for quick split of cells
    DTYPE_t[3] barycenter                # Keep track of the center of mass of the cell

    # Cell boundaries
    DTYPE_t[3] min_bounds                # Inferior boundaries of this cell (inclusive)
    DTYPE_t[3] max_bounds                # Superior boundaries of this cell (exclusive)


cdef class _QuadTree:
    # The QuadTree object is a quad tree structure constructed by inserting
    # recursively points in the tree and splitting cells in 4 so that each
    # leaf cell contains at most one point.
    # This structure also handle 3D data, inserted in trees with 8 children
    # for each node.

    # Parameters of the tree
    cdef public int n_dimensions         # Number of dimensions in X
    cdef public int verbose              # Verbosity of the output
    cdef SIZE_t n_cells_per_cell         # Number of children per node. (2 ** n_dimension)

    # Tree inner structure
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t cell_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef public SIZE_t n_points          # Total number of points
    cdef Cell* cells                     # Array of nodes

    # Point insertion methods
    cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index,
                          SIZE_t cell_id=*) except -1 nogil
    cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
                                           SIZE_t point_index, SIZE_t size=*
                                           ) nogil
    cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil
    cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil

    # Create a summary of the Tree compare to a query point
    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
                        float squared_theta=*, SIZE_t cell_id=*, long idx=*
                        ) noexcept nogil

    # Internal cell initialization methods
    cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
    cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds
                         ) nogil

    # Private methods
    cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell
                                  ) except -1 nogil

    # Private array manipulation to manage the ``cells`` array
    cdef int _resize(self, SIZE_t capacity) except -1 nogil
    cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil
    cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) except -1 nogil
    cdef np.ndarray _get_cell_ndarray(self)
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamDecode/sklearn-pmml-model/13d992034c29847aa6ed3e377c0eaa5b3366d7cd/tests/__init__.py
-------------------------------------------------------------------------------- /tests/naive_bayes/test_naive_bayes.py: --------------------------------------------------------------------------------
"""Tests for the PMML Gaussian Naive Bayes wrapper (PMMLGaussianNB)."""
from unittest import TestCase
import sklearn_pmml_model
from sklearn_pmml_model.naive_bayes import PMMLGaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
from os import path, remove
from io import StringIO
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


# Directory of the installed sklearn_pmml_model package; the PMML fixtures
# referenced below live in the sibling ../models directory.
BASE_DIR = path.dirname(sklearn_pmml_model.__file__)


class TestNaiveBayes(TestCase):
  # Constructor-time validation of PMMLGaussianNB.

  def test_invalid_model(self):
    """A PMML document without a NaiveBayesModel element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLGaussianNB(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain NaiveBayesModel.'

  def test_unsupported_distribution(self):
    """A NaiveBayesModel using a distribution scikit-learn cannot map must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLGaussianNB(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'Distribution "PoissonDistribution" not implemented, or not supported by scikit-learn'

  def test_more_tags(self):
    """The wrapper advertises the same estimator tags as sklearn's GaussianNB."""
    clf = PMMLGaussianNB(path.join(BASE_DIR, '../models/nb-cat-pima.pmml'))
    assert clf._more_tags() == GaussianNB()._more_tags()


class TestGaussianNBIntegration(TestCase):
  # End-to-end checks of a PMML model against a natively-fitted GaussianNB.

  def setUp(self):
    """Load the categorical test set and both the PMML and reference models."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    Xte = df.iloc[:, 1:]
    # One-hot encode the categorical features for the reference estimator.
    Xenc = pd.get_dummies(Xte, prefix_sep='')
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)
    self.enc = (Xenc, yte)

    pmml = path.join(BASE_DIR, '../models/nb-cat-pima.pmml')
    self.clf = PMMLGaussianNB(pmml)

    self.ref = GaussianNB()
    self.ref.fit(Xenc, yte)

  def test_predict_proba(self):
    """Class probabilities match the precomputed reference values."""
    Xte, _ = self.test
    ref = np.array([0.089665518, 0.229009345, 0.007881006, 0.025306284, 0.013287187, 0.085741556, 0.338780868, 0.063463670, 0.769219497, 0.100369704, 0.002308186, 0.050380836, 0.054716302, 0.114718523, 0.156496072, 0.076301905, 0.806474996, 0.001227284, 0.121921194, 0.146751623, 0.074212037, 0.084148702, 0.479980587, 0.234470483, 0.354876655, 0.480582547, 0.113901660, 0.969566830, 0.989918477, 0.760519487, 0.599039599, 0.997856475, 0.776102648, 0.863233887, 0.910001902, 0.846005607, 0.734269347, 0.841546008, 0.120615475, 0.457027577, 0.124201960, 0.882691224, 0.930458760, 0.585210046, 0.484105369, 0.697949034, 0.778448666, 0.820806942, 0.074380668, 0.978478762, 0.589284915, 0.586728917])
    assert np.allclose(ref, self.clf.predict_proba(Xte)[:, 0])

  def test_predict(self):
    """Hard predictions match the precomputed reference labels."""
    Xte, _ = self.test
    ref = np.array(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No'])
    assert all(ref == self.clf.predict(Xte))

  def test_score(self):
    """Accuracy on the test set matches the precomputed reference score."""
    Xte, yte = self.test
    ref = 0.8461538462
    assert np.allclose(ref, self.clf.score(Xte, yte))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'

  def test_sklearn2pmml(self):
    """Round trip: export the reference model via sklearn2pmml, re-import, compare."""
    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", self.ref)
    ])
    pipeline.fit(self.enc[0], self.enc[1])
    sklearn2pmml(pipeline, "gnb-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLGaussianNB(pmml='gnb-sklearn2pmml.pmml')

      # Verify classification
      Xenc, _ = self.enc
      assert np.allclose(
        self.ref.predict_proba(Xenc),
        model.predict_proba(Xenc)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("gnb-sklearn2pmml.pmml")


class TestGaussianNBWineIntegration(TestCase):
  # Round-trip check on the purely numeric wine dataset.

  def setUp(self):
    """Fit a reference GaussianNB on the wine dataset."""
    df = load_wine(as_frame=True)
    Xte = df.data
    yte = df.target
    self.test = (Xte, yte)

    self.ref = GaussianNB()
    self.ref.fit(Xte, yte)

  def test_sklearn2pmml(self):
    """Round trip: export via sklearn2pmml, re-import, compare probabilities."""
    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", self.ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "gnb-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLGaussianNB(pmml='gnb-sklearn2pmml.pmml')

      # Verify classification
      Xte, _ = self.test
      assert np.allclose(
        self.ref.predict_proba(Xte),
        model.predict_proba(Xte)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("gnb-sklearn2pmml.pmml")
-------------------------------------------------------------------------------- /tests/neighbors/test_knn.py: --------------------------------------------------------------------------------
"""Tests for the PMML k-nearest-neighbors wrappers."""
from unittest import TestCase
import sklearn_pmml_model
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn_pmml_model.neighbors import PMMLKNeighborsClassifier, PMMLKNeighborsRegressor
import pandas as pd
import numpy as np
from os import path, remove
from io import StringIO
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


# Directory of the installed sklearn_pmml_model package; the PMML fixtures
# referenced below live in the sibling ../models directory.
BASE_DIR = path.dirname(sklearn_pmml_model.__file__)


class TestKNearestNeighbors(TestCase):
  # Constructor-time validation of PMMLKNeighborsClassifier.

  def test_invalid_model(self):
    """A PMML document without a NearestNeighborModel element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain NearestNeighborModel.'

  def test_no_distance_metric(self):
    """A NearestNeighborModel without a ComparisonMeasure element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain ComparisonMeasure.'

  def test_unsupported_distance_metric(self):
    """A ComparisonMeasure with an unknown metric name must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model uses unsupported distance metric: "funkydistance".'
class TestKNeighborsClassifierIntegration(TestCase):
  # End-to-end checks of a PMML k-NN classifier against on-disk fixtures.

  def setUp(self):
    """Load the categorical test set (ordinal-encoding 'age') and the PMML model."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    cats = np.unique(df['age'])
    # Encode the categorical 'age' column as 1-based ordinal codes, matching
    # the encoding used when the PMML fixture was produced.
    df['age'] = pd.Categorical(df['age'], categories=cats).codes + 1
    Xte = df.iloc[:, 1:]
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)

    pmml = path.join(BASE_DIR, '../models/knn-clf-pima.pmml')
    self.clf = PMMLKNeighborsClassifier(pmml)

  def test_predict(self):
    """Hard predictions match the precomputed reference labels."""
    Xte, yte = self.test
    ref = np.array(['Yes','No','Yes','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','No','No','No','Yes','No','No','No','No','No','No','No','Yes','No','No','No','No','No','No','Yes','No','No','No','No','Yes','Yes'])
    assert np.array_equal(ref, np.array(self.clf.predict(Xte)))

  def test_score(self):
    """Accuracy on the test set matches the precomputed reference score."""
    Xte, yte = self.test
    ref = 0.807692307692307
    assert np.allclose(ref, self.clf.score(Xte, yte))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'

  def test_more_tags(self):
    """Tags match sklearn's KNeighborsClassifier, plus the requires_y marker."""
    assert self.clf._more_tags() == {'requires_y': True, **KNeighborsClassifier()._more_tags()}

  def test_sklearn2pmml(self):
    """Round trip: export a fitted classifier via sklearn2pmml, re-import, compare."""
    X, y = self.test
    ref = KNeighborsClassifier(n_neighbors=11)
    ref.fit(X, y)

    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "knn-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLKNeighborsClassifier(pmml='knn-sklearn2pmml.pmml')

      assert np.allclose(
        ref.predict_proba(X),
        model.predict_proba(X)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("knn-sklearn2pmml.pmml")

  def test_sklearn2pmml_manhattan(self):
    """Same round trip as above, with a non-default (manhattan) metric."""
    X, y = self.test
    ref = KNeighborsClassifier(metric='manhattan', n_neighbors=8)
    ref.fit(X, y)

    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "knn-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLKNeighborsClassifier(pmml='knn-sklearn2pmml.pmml')

      assert np.allclose(
        ref.predict_proba(X),
        model.predict_proba(X)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("knn-sklearn2pmml.pmml")


class TestKNeighborsRegressorIntegration(TestCase):
  # End-to-end checks of a PMML k-NN regressor against on-disk fixtures.

  def setUp(self):
    """Load the categorical test set (ordinal-encoding 'age') and the PMML model."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    cats = np.unique(df['age'])
    # Encode the categorical 'age' column as 1-based ordinal codes, matching
    # the encoding used when the PMML fixture was produced.
    df['age'] = pd.Categorical(df['age'], categories=cats).codes + 1
    Xte = df.iloc[:, 1:]
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)

    pmml = path.join(BASE_DIR, '../models/knn-reg-pima.pmml')
    self.clf = PMMLKNeighborsRegressor(pmml)

  def test_predict(self):
    """Regression outputs match the precomputed reference values."""
    Xte, yte = self.test
    ref = np.array([
      0.7142857,
      0.1428571,
      0.7142857,
      1.0000000,
      0.8571429,
      0.8571429,
      0.7142857,
      0.8571429,
      0.2857143,
      0.8571429,
      0.8571429,
      0.7142857,
      0.8571429,
      1.0000000,
      0.8571429,
      0.7142857,
      0.1428571,
      1.0000000,
      0.5714286,
      0.7142857,
      0.8571429,
      0.5714286,
      0.4285714,
      0.5714286,
      1.0000000,
      0.5714286,
      0.8571429,
      0.1428571,
      0.1428571,
      0.2857143,
      0.7142857,
      0.1428571,
      0.2857143,
      0.0000000,
      0.1428571,
      0.2857143,
      0.1428571,
      0.0000000,
      0.8571429,
      0.4285714,
      0.2857143,
      0.1428571,
      0.1428571,
      0.1428571,
      0.1428571,
      0.7142857,
      0.0000000,
      0.1428571,
      0.2857143,
      0.1428571,
      0.7142857,
      0.7142857,
    ])
    assert np.allclose(ref, np.array(self.clf.predict(Xte)))

  def test_score(self):
    """R^2 score against the binarized target matches the reference value."""
    Xte, yte = self.test
    ref = 0.383045525902668
    assert np.allclose(ref, self.clf.score(Xte, (yte == 'Yes').astype(int)))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'
  def test_more_tags(self):
    """Tags match sklearn's KNeighborsRegressor."""
    assert self.clf._more_tags() == KNeighborsRegressor()._more_tags()
-------------------------------------------------------------------------------- /tests/test_datatypes.py: --------------------------------------------------------------------------------
"""Tests for the Interval and Category PMML data types."""
from unittest import TestCase
from sklearn_pmml_model.datatypes import Category, Interval


class TestInterval(TestCase):
  # Validation and membership semantics of Interval.

  def test_exception(self):
    """Invalid constructor arguments must fail the internal assertions."""
    # Missing both endpoints.
    with self.assertRaises(Exception) as cm:
      Interval(closure='openOpen')
    assert type(cm.exception) == AssertionError

    # Lower bound greater than upper bound.
    with self.assertRaises(Exception) as cm:
      Interval('openOpen', 3, 0)
    assert type(cm.exception) == AssertionError

    # Unknown closure keyword.
    with self.assertRaises(Exception) as cm:
      Interval('non_existing_closure', 0)
    assert type(cm.exception) == AssertionError

  def test_contains(self):
    """`in` respects the closed-closed boundaries."""
    interval = Interval('closedClosed', 1, 10)

    assert 2 in interval
    assert 0 not in interval
    assert 10.1 not in interval


class TestCategory(TestCase):
  # Validation, membership, and conversion semantics of Category.

  def test_exception(self):
    """Invalid constructor arguments must fail the internal assertions."""
    # categories must be a list, not a string.
    with self.assertRaises(Exception) as cm:
      Category(str, categories="bad cats")
    assert type(cm.exception) == AssertionError

    # ordered must be a boolean, not an int.
    with self.assertRaises(Exception) as cm:
      Category(str, [1, 2], ordered=1)
    assert type(cm.exception) == AssertionError

  def test_contains(self):
    """`in` tests membership of the declared categories."""
    categories = ['loud', 'louder', 'loudest']
    cat_type = Category(str, categories, ordered=True)

    assert 'loud' in cat_type
    assert 'bad' not in cat_type

  def test_callable(self):
    """Calling the type coerces valid values and raises for unknown ones."""
    categories = ['1', '2', '3']
    cat_type = Category(int, categories, ordered=True)

    with self.assertRaises(Exception) as cm:
      cat_type('4')

    assert str(cm.exception) == 'Invalid categorical value: 4'
    # Valid values are converted to the declared base type (int).
    assert isinstance(cat_type('1'), int)
    assert cat_type('2') == 2
--------------------------------------------------------------------------------