├── .circleci └── config.yml ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build.yaml │ ├── codeql.yml │ └── lint.yaml ├── .gitignore ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmark.ipynb ├── docs ├── Makefile ├── _static │ └── custom.css ├── conf.py ├── contributing.md ├── index.rst ├── install.rst └── release_notes.rst ├── models ├── categorical-test.csv ├── gb-gbm-cat-pima-regression.pmml ├── gb-gbm-cat-pima.pmml ├── gb-xgboost-iris.pmml ├── knn-clf-pima.pmml ├── knn-reg-pima.pmml ├── linear-model-glm.pmml ├── linear-model-lasso.pmml ├── linear-model-lm.pmml ├── linear-model-lmc.pmml ├── linear-model-ridge.pmml ├── linear-model-ridgec.pmml ├── nb-cat-pima.pmml ├── nn-iris.pmml ├── nn-pima-regression.pmml ├── randomForest.pmml ├── rf-cat-pima-regression.pmml ├── rf-cat-pima.pmml ├── rf-iris.pmml ├── svc-cat-pima.pmml ├── svr-cat-pima.pmml ├── tree-cat-pima-regression.pmml ├── tree-cat-pima.pmml ├── tree-cat.pmml ├── tree-digits.pmml └── tree-iris.pmml ├── pyproject.toml ├── requirements.txt ├── setup.py ├── sklearn_pmml_model ├── __init__.py ├── auto_detect │ ├── __init__.py │ └── base.py ├── base.py ├── datatypes.py ├── ensemble │ ├── README.md │ ├── __init__.py │ ├── _gradient_boosting.pyx │ ├── forest.py │ └── gb.py ├── linear_model │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── implementations.py ├── naive_bayes │ ├── README.md │ ├── __init__.py │ └── implementations.py ├── neighbors │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py ├── neural_network │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py ├── svm │ ├── README.md │ ├── __init__.py │ ├── _base.py │ └── _classes.py └── tree │ ├── README.md │ ├── __init__.py │ ├── _criterion.pxd │ ├── _criterion.pyx │ ├── _splitter.pxd │ ├── _splitter.pyx │ ├── _tree.pxd │ ├── _tree.pyx │ ├── _utils.pxd │ ├── _utils.pyx │ ├── quad_tree.pxd │ ├── quad_tree.pyx │ 
└── tree.py └── tests ├── __init__.py ├── auto_detect └── test_auto_detect.py ├── ensemble ├── test_forest.py └── test_gb.py ├── linear_model └── test_linear_model.py ├── naive_bayes └── test_naive_bayes.py ├── neighbors └── test_knn.py ├── neural_network └── test_neural_network.py ├── svm └── test_svm.py ├── test_base.py ├── test_datatypes.py └── tree └── test_tree.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | version: 2.1 3 | 4 | jobs: 5 | build: 6 | docker: 7 | # specify the version you desire here 8 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 9 | - image: circleci/python:3.9.2 10 | 11 | # Specify service dependencies here if necessary 12 | # CircleCI maintains a library of pre-built images 13 | # documented at https://circleci.com/docs/2.0/circleci-images/ 14 | # - image: circleci/postgres:9.4 15 | 16 | working_directory: ~/repo 17 | 18 | steps: 19 | - run: echo 'export PYTHONPATH=$HOME/repo' >> $BASH_ENV 20 | 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | sudo pip install codecov 37 | 38 | - run: 39 | name: install openjdk 40 | command: | 41 | sudo apt update 42 | sudo apt install default-jdk 43 | 44 | - save_cache: 45 | paths: 46 | - ./venv 47 | key: v1-dependencies-{{ checksum "requirements.txt" }} 48 | 49 | - run: 50 | name: run tests 51 | command: | 52 | . 
venv/bin/activate 53 | python setup.py build_ext --inplace 54 | pytest --cov=./sklearn_pmml_model tests/ 55 | 56 | - run: 57 | name: coverage report 58 | command: codecov 59 | 60 | - store_artifacts: 61 | path: test-reports 62 | destination: test-reports 63 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E111,E121,E114 3 | D100,D105,D102 4 | per-file-ignores = 5 | __init__.py: D200,D205,D400 6 | docstring-convention = numpy 7 | max-line-length = 120 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Description 11 | 14 | 15 | #### Steps/Code to Reproduce 16 | 25 | 26 | #### Expected Results 27 | 32 | 33 | #### Actual Results 34 | 38 | 39 | #### Versions 40 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Description 11 | 18 | 19 | #### Additional context 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | release: 7 | types: 8 | - published 9 | 10 | jobs: 11 | build_wheels: 12 | name: Build wheels on ${{ matrix.os }} 13 | runs-on: ${{ matrix.os }} 14 | 
strategy: 15 | matrix: 16 | # macos-13 is an intel runner, macos-14 is apple silicon 17 | os: [windows-latest, macos-13, macos-14, ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: actions/setup-python@v5 23 | name: Install Python 24 | with: 25 | python-version: '3.12' 26 | 27 | - name: Build wheels 28 | uses: pypa/cibuildwheel@v2.17.0 29 | 30 | - uses: actions/upload-artifact@v4 31 | with: 32 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 33 | path: ./wheelhouse/*.whl 34 | 35 | build_sdist: 36 | name: Build source distribution 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v4 40 | 41 | - uses: actions/setup-python@v5 42 | name: Install Python 43 | with: 44 | python-version: '3.12' 45 | 46 | - name: Install dependencies 47 | run: python -m pip install cython numpy setuptools 48 | 49 | - name: Build sdist 50 | run: python setup.py sdist 51 | 52 | - uses: actions/upload-artifact@v4 53 | with: 54 | name: cibw-sdist 55 | path: dist/*.tar.gz 56 | 57 | upload_pypi: 58 | name: Upload to PyPI 59 | needs: [build_wheels, build_sdist] 60 | runs-on: ubuntu-latest 61 | if: github.event_name == 'release' && github.event.action == 'published' 62 | steps: 63 | - uses: actions/download-artifact@v4 64 | with: 65 | pattern: cibw-* 66 | path: dist 67 | merge-multiple: true 68 | 69 | - uses: pypa/gh-action-pypi-publish@release/v1 70 | with: 71 | user: __token__ 72 | password: ${{ secrets.pypi_password }} 73 | skip_existing: true -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ 'master' ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [ 'master' ] 9 | schedule: 10 | - cron: '52 16 * * 0' 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | 
permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | language: [ 'python' ] 25 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 26 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 27 | 28 | steps: 29 | - name: Checkout repository 30 | uses: actions/checkout@v3 31 | 32 | # Initializes the CodeQL tools for scanning. 33 | - name: Initialize CodeQL 34 | uses: github/codeql-action/init@v2 35 | with: 36 | languages: ${{ matrix.language }} 37 | # If you wish to specify custom queries, you can do so here or in a config file. 38 | # By default, queries listed here will override any specified in a config file. 39 | # Prefix the list here with "+" to use these queries and those in the config file. 40 | 41 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 42 | queries: +security-and-quality 43 | 44 | 45 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 46 | # If this step fails, then you should remove it and run the build manually (see below) 47 | - name: Autobuild 48 | uses: github/codeql-action/autobuild@v2 49 | 50 | # ℹ️ Command-line programs to run using the OS shell. 51 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 52 | 53 | # If the Autobuild fails above, remove it and uncomment the following three lines. 54 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
55 | 56 | # - run: | 57 | # echo "Run, Build Application using script" 58 | # ./location_of_script_within_repo/buildscript.sh 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v2 62 | with: 63 | category: "/language:${{matrix.language}}" 64 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | flake8: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | 11 | - uses: actions/setup-python@v2 12 | name: Install Python 13 | with: 14 | python-version: '3.8' 15 | 16 | - name: Install flake8 17 | run: pip install flake8 flake8-docstrings 18 | 19 | - name: Run flake8 20 | run: flake8 sklearn_pmml_model 21 | 22 | commitlint: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v2 26 | with: 27 | fetch-depth: 0 28 | - uses: wagoid/commitlint-github-action@v4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Intellij IDEA 108 | .idea/ 109 | *.iml 110 | 111 | models/generate_pmml.R -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Set the version of Python and other tools you might need 5 | build: 6 | os: ubuntu-20.04 7 | tools: 8 | python: "3.9" 9 | 10 | # Build documentation in the docs/ directory with Sphinx 11 | sphinx: 12 | configuration: docs/conf.py 13 | 14 | 15 | # Optionally declare the Python requirements required to build your docs 16 | python: 17 | install: 18 | - requirements: requirements.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to 
any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | d.collaris@me.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. 
No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via a GitHub issue, email, or any other method with the owners of this repository before making a change. 4 | 5 | Please note we have a [code of conduct](https://github.com/iamDecode/sklearn-pmml-model/blob/master/CODE_OF_CONDUCT.md), please follow it in all your interactions with the project. 6 | 7 | 8 | ## Scope of this package 9 | 10 | The scope of `sklearn-pmml-model` is to import functionality to all major estimator classes of the popular machine learning library [scikit-learn](https://scikit-learn.org) using [PMML](http://dmg.org/pmml/v4-4/GeneralStructure.html). 11 | 12 | The API is designed to closely resemble the `scikit-learn` API. The same directory and component structure is used, and each estimator is a sub-class of a corresponding estimator. Note that some models may not have a `scikit-learn` implementation (e.g., Bayesian networks) and hence cannot currently be represented. 13 | 14 | We intend for the library to remain as light-weight as possible, and stick with the minimum number of additions to enable PMML import functionality without affecting the outward facing API of estimators. 15 | 16 | 17 | ## Reporting bugs 18 | 19 | We use GitHub issues to track all bugs and feature requests; feel free to open an issue if you have found a bug or wish to see a feature implemented. 20 | 21 | It is recommended to check that your issue complies with the following rules before submitting: 22 | 23 | - Verify that your issue is not being currently addressed by other [issues](https://github.com/iamDecode/sklearn-pmml-model/issues) or [pull requests](https://github.com/iamDecode/sklearn-pmml-model/pulls). 
24 | - Please include code snippets or error messages when reporting issues. When doing so, please make sure to format them using code blocks. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks). 25 | - It can often be helpful to include your operating system type and version number, as well as your Python, sklearn-pmml-model, scikit-learn, numpy, and scipy versions. This information can be found by running the following code snippet: 26 | ```python 27 | import platform; print(platform.platform()) 28 | import sys; print("Python", sys.version) 29 | import numpy; print("NumPy", numpy.__version__) 30 | import scipy; print("SciPy", scipy.__version__) 31 | import sklearn; print("Scikit-Learn", sklearn.__version__) 32 | import sklearn_pmml_model; print("sklearn-pmml-model", sklearn_pmml_model.__version__) 33 | ``` 34 | 35 | 36 | ## Get a local copy 37 | 38 | These are the steps you need to take to create a copy of the `sklearn-pmml-model` repository on your computer. 39 | 40 | 1. [Create an account](https://github.com/join) on GitHub if you do not already have one. 41 | 42 | 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [`sklearn-pmml-model` repository](https://github.com/iamDecode/sklearn-pmml-model). 43 | 44 | 3. Clone your fork of the `sklearn-pmml-model` repository from your GitHub account. Use a git GUI application (e.g., Sourcetree, GitKraken) or from command line, run: 45 | 46 | ``` 47 | $ git clone git@github.com:iamDecode/sklearn-pmml-model.git 48 | $ cd sklearn-pmml-model 49 | ``` 50 | 51 | 4. Create a feature branch to hold your development changes: 52 | 53 | ``` 54 | $ git checkout -b / 55 | ``` 56 | 57 | (For example: `decode/regression-trees`) 58 | 59 | 60 | ## Setting up a development environment 61 | 62 | After you have created a copy of our main repository on GitHub, you need to set up a local development environment. 
We recommend creating a virtual environment and activating it: 63 | ``` 64 | $ python3 -m venv venv 65 | $ source venv/bin/activate 66 | ``` 67 | 68 | and install the dependencies within the virtual environment: 69 | 70 | ``` 71 | $ pip install -r requirements.txt 72 | ``` 73 | 74 | The final step is to build the Cython extensions (you need to rebuild once you make changes to the Cython code): 75 | 76 | ``` 77 | $ python setup.py build_ext --inplace 78 | ``` 79 | 80 | ## Making changes to the code 81 | 82 | For pull requests to be accepted, your changes must at least meet the following requirements: 83 | 84 | 1. All changes related to *one feature* must belong to *one branch*. Each branch must be self-contained, with a single new feature or bugfix. 85 | 2. Commit messages should be formulated according to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/). 86 | 3. If your pull request addresses an issue, please make sure to [link back](https://github.blog/changelog/2020-12-15-reference-issues-discussions-and-pull-requests-faster-with-multi-word-suggestions/) to the original issue. 87 | 4. Follow the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). With the following exceptions or additions: 88 | - The max line length is 120 characters instead of 80. 89 | - Indents with double spaces, not 4 spaces or tabs. 90 | 91 | You can check for compliance locally by running: 92 | ``` 93 | $ flake8 sklearn_pmml_model 94 | ``` 95 | 5. Each function, class, method, and attribute needs to be documented using docstrings. `sklearn-pmml-model` conforms to the [numpy docstring standard](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard). 96 | 6. Finally, ensure all the test cases still pass after you have made your changes. To test locally, you can run: 97 | ``` 98 | $ python setup.py pytest 99 | ``` 100 | 101 | In addition to these requirements, we strongly prefer you to consider the following guidelines. 
However, they are not strictly required, so as not to be overly prohibitive to new contributors. 102 | 103 | 7. Your change should include test cases for all new functionality being introduced. 104 | 8. No additional code style issues should be reported by [LGTM](https://lgtm.com). 105 | 106 | Continuous integration will automatically verify compliance with all of the discussed requirements. 107 | 108 | 109 | 110 | ## Submitting a Pull Request 111 | 112 | 1. When you are done coding in your feature branch, [add changed or new files](https://git-scm.com/book/en/v2/Git-Basics-Recording-Changes-to-the-Repository#_tracking_files): 113 | ``` 114 | $ git add path/to/modified_file 115 | ``` 116 | 2. Create a [commit](https://git-scm.com/book/en/v2/Git-Basics-Recording-Changes-to-the-Repository#_committing_changes) with a message describing what you changed. Commit messages should be formulated according to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) standard: 117 | ``` 118 | $ git commit 119 | ``` 120 | 3. Push the changes to GitHub: 121 | ``` 122 | $ git push -u origin my_feature 123 | ``` 124 | 4. [Create a pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Dennis Collaris 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sklearn-pmml-model 4 | 5 | [![PyPI version](https://badge.fury.io/py/sklearn-pmml-model.svg)](https://badge.fury.io/py/sklearn-pmml-model) 6 | [![codecov](https://codecov.io/gh/iamDecode/sklearn-pmml-model/branch/master/graph/badge.svg?token=CGbbgziGwn)](https://codecov.io/gh/iamDecode/sklearn-pmml-model) 7 | [![CircleCI](https://circleci.com/gh/iamDecode/sklearn-pmml-model.svg?style=shield)](https://circleci.com/gh/iamDecode/sklearn-pmml-model) 8 | [![ReadTheDocs](https://readthedocs.org/projects/sklearn-pmml-model/badge/?version=latest&style=flat)](https://sklearn-pmml-model.readthedocs.io/en/latest/) 9 | 10 | A library to effortlessly import models trained on different platforms and with programming languages into scikit-learn in Python. 
First export your model to [PMML](http://dmg.org/pmml/v4-3/GeneralStructure.html) (widely supported). Next, load the exported PMML file with this library, and use the class as any other scikit-learn estimator. 11 | 12 | 13 | ## Installation 14 | 15 | The easiest way is to use pip: 16 | 17 | ``` 18 | $ pip install sklearn-pmml-model 19 | ``` 20 | 21 | ## Status 22 | The library currently supports the following models: 23 | 24 | | Model | Classification | Regression | Categorical features | 25 | |--------------------------------------------------------|----------------|------------|----------------------| 26 | | [Decision Trees](sklearn_pmml_model/tree) | ✅ | ✅ | ✅1 | 27 | | [Random Forests](sklearn_pmml_model/ensemble) | ✅ | ✅ | ✅1 | 28 | | [Gradient Boosting](sklearn_pmml_model/ensemble) | ✅ | ✅ | ✅1 | 29 | | [Linear Regression](sklearn_pmml_model/linear_model) | ✅ | ✅ | ✅3 | 30 | | [Ridge](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 31 | | [Lasso](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 32 | | [ElasticNet](sklearn_pmml_model/linear_model) | ✅2 | ✅ | ✅3 | 33 | | [Gaussian Naive Bayes](sklearn_pmml_model/naive_bayes) | ✅ | | ✅3 | 34 | | [Support Vector Machines](sklearn_pmml_model/svm) | ✅ | ✅ | ✅3 | 35 | | [Nearest Neighbors](sklearn_pmml_model/neighbors) | ✅ | ✅ | | 36 | | [Neural Networks](sklearn_pmml_model/neural_network) | ✅ | ✅ | | 37 | 38 | 1 Categorical feature support using slightly modified internals, based on [scikit-learn#12866](https://github.com/scikit-learn/scikit-learn/pull/12866). 39 | 40 | 2 These models differ only in training characteristics, the resulting model is of the same form. Classification is supported using `PMMLLogisticRegression` for regression models and `PMMLRidgeClassifier` for general regression models. 41 | 42 | 3 By one-hot encoding categorical features automatically. 
43 | 44 | ## Example 45 | A minimal working example (using [this PMML file](https://github.com/iamDecode/sklearn-pmml-model/blob/master/models/randomForest.pmml)) is shown below: 46 | 47 | ```python 48 | from sklearn.datasets import load_iris 49 | from sklearn.model_selection import train_test_split 50 | import pandas as pd 51 | import numpy as np 52 | from sklearn_pmml_model.ensemble import PMMLForestClassifier 53 | from sklearn_pmml_model.auto_detect import auto_detect_estimator 54 | 55 | # Prepare the data 56 | iris = load_iris() 57 | X = pd.DataFrame(iris.data) 58 | X.columns = np.array(iris.feature_names) 59 | y = pd.Series(np.array(iris.target_names)[iris.target]) 60 | y.name = "Class" 61 | Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33, random_state=123) 62 | 63 | # Specify the model type for the least overhead... 64 | #clf = PMMLForestClassifier(pmml="models/randomForest.pmml") 65 | 66 | # ...or simply let the library auto-detect the model type 67 | clf = auto_detect_estimator(pmml="models/randomForest.pmml") 68 | 69 | # Use the model as any other scikit-learn model 70 | clf.predict(Xte) 71 | clf.score(Xte, yte) 72 | ``` 73 | 74 | More examples can be found in the subsequent packages: [tree](sklearn_pmml_model/tree), [ensemble](sklearn_pmml_model/ensemble), [linear_model](sklearn_pmml_model/linear_model), [naive_bayes](sklearn_pmml_model/naive_bayes), [svm](sklearn_pmml_model/svm), [neighbors](sklearn_pmml_model/neighbors) and [neural_network](sklearn_pmml_model/neural_network). 75 | 76 | ## Benchmark 77 | 78 | Depending on the data set and model, `sklearn-pmml-model` is between 1 and 10 times faster than competing libraries, by leveraging the optimization and industry-tested robustness of `sklearn`. Source code for this benchmark can be found in the corresponding [jupyter notebook](benchmark.ipynb). 
79 | 80 | 81 | ### Running times (load + predict, in seconds) 82 | 83 | | | | Linear model | Naive Bayes | Decision tree | Random Forest | Gradient boosting | 84 | |---------------|---------------------|--------------|-------------|---------------|---------------|-------------------| 85 | | Wine | `PyPMML` | 0.013038 | 0.005674 | 0.005587 | 0.032734 | 0.034649 | 86 | | | `sklearn-pmml-model`| 0.00404 | 0.004059 | 0.000964 | 0.030008 | 0.032949 | 87 | | Breast cancer | `PyPMML` | 0.009838 | 0.01153 | 0.009367 | 0.058941 | 0.031196 | 88 | | | `sklearn-pmml-model`| 0.010749 | 0.008481 | 0.001106 | 0.044021 | 0.013411 | 89 | 90 | ### Improvement 91 | 92 | | | | Linear model | Naive Bayes | Decision tree | Random Forest | Gradient boosting | 93 | |---------------|--------------------|--------------|-------------|---------------|---------------|-------------------| 94 | | Wine | Improvement | 3.23× | 1.40× | 5.80× | 1.09× | 1.05× | 95 | | Breast cancer | Improvement | 0.91× | 1.36× | **8.47×** | 1.34× | 2.33× | 96 | 97 | *Benchmark ran on: 24 September 2024 17:19* 98 | 99 | ## Development 100 | 101 | ### Prerequisites 102 | 103 | Tests can be run using Py.test. Grab a local copy of the source: 104 | 105 | ``` 106 | $ git clone http://github.com/iamDecode/sklearn-pmml-model 107 | $ cd sklearn-pmml-model 108 | ``` 109 | 110 | create a virtual environment and activate it: 111 | ``` 112 | $ python3 -m venv venv 113 | $ source venv/bin/activate 114 | ``` 115 | 116 | and install the dependencies: 117 | 118 | ``` 119 | $ pip install -r requirements.txt 120 | ``` 121 | 122 | The final step is to build the Cython extensions: 123 | 124 | ``` 125 | $ python setup.py build_ext --inplace 126 | ``` 127 | 128 | ### Testing 129 | 130 | You can execute tests with py.test by running: 131 | ``` 132 | $ python setup.py pytest 133 | ``` 134 | 135 | ## Contributing 136 | 137 | Feel free to make a contribution. Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. 
138 | 139 | ## License 140 | 141 | This project is licensed under the BSD 2-Clause License - see the [LICENSE](LICENSE) file for details. 142 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .navbar .container-xl { 2 | padding-left: 15px !important; 3 | padding-right: 15px !important; 4 | } 5 | 6 | #navbar-icon-links { 7 | display: none; 8 | } 9 | @media (min-width: 960px) { 10 | #navbar-icon-links { 11 | display: flex; 12 | } 13 | } 14 | 15 | .fas.pypi { 16 | font-size: 20px !important; 17 | margin-top: -2px; 18 | } 19 | 20 | .tile { 21 | position: relative; 22 | display: block; 23 | padding: 1rem 1.2rem; 24 | border-radius: 6px; 25 | box-shadow: 0 1px 5px rgba(0, 0, 0, 0.2) 26 | } 27 | 28 | .tile .tile-title { 29 | position: relative; 30 | font-size: 22px; 31 | line-height: 1.4; 32 | margin-top: 0; 33 | margin-bottom: .5rem; 34 | } 35 | 36 | .tile .tile-title .tile-icon { 37 | position: absolute; 38 | right: 0; 39 | line-height: 1.4; 40 | transition: 
all .2s; 41 | } 42 | 43 | .tile h3.tile-title { 44 | color: rgba(var(--pst-color-link),1); 45 | } 46 | 47 | .tile h3.tile-title:hover { 48 | color:rgba(var(--pst-color-link-hover),1); 49 | } 50 | 51 | .tile h3.tile-title:before { 52 | height: 0; 53 | margin: 0; 54 | } 55 | 56 | .tile .tile-desc { 57 | margin-top: 1rem; 58 | color: #646976 59 | } 60 | 61 | .tile .tile-desc p:last-child { 62 | margin-bottom: 0 63 | } 64 | 65 | .tile:after { 66 | content: ''; 67 | position: absolute; 68 | right: 0; 69 | bottom: 0; 70 | left: 0; 71 | height: 3px; 72 | background: #4ce8ff; 73 | background: linear-gradient(90deg, #4ce8ff, #d07cff); 74 | opacity: 0; 75 | transition: all .2s 76 | } 77 | 78 | .tile:hover:after { 79 | opacity: 1 80 | } 81 | 82 | .tile:hover .tile-icon { 83 | transform: scale(1.4) 84 | } 85 | 86 | a.tile-link:active, 87 | a.tile-link:link, 88 | a.tile-link:hover, 89 | a.tile-link:focus { 90 | text-decoration: none; 91 | } 92 | 93 | .tile .tile-desc pre { 94 | font-size: 80%; 95 | white-space: pre-line; 96 | } 97 | 98 | main > div > .reference.internal.image-reference img { 99 | margin-bottom: 25px; 100 | } 101 | 102 | .sig-prename { 103 | display: none; 104 | } 105 | 106 | .section > dl.py { 107 | margin-top: 70px; 108 | } -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file only contains a selection of the most common options. For a full 6 | # list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. 
If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'sklearn-pmml-model' 23 | current_year = datetime.utcnow().year 24 | copyright = f'2018 - {current_year}, Dennis Collaris' 25 | author = 'Dennis Collaris' 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'autoapi.extension', 34 | 'numpydoc', 35 | 'sphinx_github_changelog', 36 | 'myst_parser', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 
51 | # 52 | html_theme = 'pydata_sphinx_theme' 53 | html_title = project 54 | 55 | html_theme_options = { 56 | "collapse_navigation": False, 57 | "icon_links": [ 58 | { 59 | "name": "GitHub", 60 | "url": "https://github.com/iamDecode/sklearn-pmml-model", 61 | "icon": "fab fa-github-square", 62 | }, 63 | { 64 | "name": "PyPI", 65 | "url": "https://pypi.org/project/sklearn-pmml-model", 66 | "icon": "fas fa-box pypi", 67 | }, 68 | ] 69 | } 70 | 71 | # Add any paths that contain custom static files (such as style sheets) here, 72 | # relative to this directory. They are copied after the builtin static files, 73 | # so a file named "default.css" will overwrite the builtin "default.css". 74 | html_static_path = ['_static'] 75 | 76 | master_doc = 'index' 77 | 78 | # -- AutoAPI ----------------------------------------------------------------- 79 | autoapi_type = 'python' 80 | autoapi_dirs = ['../sklearn_pmml_model'] 81 | 82 | 83 | def setup(app): 84 | app.add_css_file('custom.css') 85 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://user-images.githubusercontent.com/1223300/41346080-c2c910a0-6f05-11e8-89e9-71a72bb9543f.png 2 | :width: 300px 3 | :alt: sklearn-pmml-model 4 | 5 | Documentation 6 | ============= 7 | 8 | A library to effortlessly import models trained on different platforms and with programming languages into scikit-learn in Python. First export your model to PMML (widely supported). Next, load the exported PMML file with this library, and use the class as any other scikit-learn estimator. 9 | 10 | .. raw:: html 11 | 12 |
13 |
14 |
15 | 16 |

Install 17 | 18 |

19 |
20 |
21 |

The easiest way to install sklearn-pmml-model is to use pip by running:

22 |
23 |
24 |
$ pip install sklearn-pmml-model
25 |
26 |
27 | 28 |

Alternatively, you can install from source using the details described on GitHub.

29 |
30 |
31 |
32 | 44 |
45 | 46 | .. toctree:: 47 | :maxdepth: 4 48 | :caption: Contents: 49 | 50 | Install 51 | Contribute 52 | Release notes 53 | 54 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installing sklearn-pmml-model 2 | ============================= 3 | 4 | The easiest way to install :code:`sklearn-pmml-model` is with :ref:`install-pip`. Alternatively, you can install it :ref:`from source`. 5 | 6 | .. _install-pip: 7 | 8 | pip 9 | -------- 10 | 11 | Pre-built binary packages (wheels) are provided for Linux, MacOS, and Windows through PyPI. 12 | To install using :code:`pip`, simply run:: 13 | 14 | $ pip install sklearn-pmml-model 15 | 16 | More details on using `pip` can be found `here `_. 17 | 18 | .. _install-from-source: 19 | 20 | From source 21 | ----------- 22 | 23 | If you want to build :code:`sklearn-pmml-model` from source, you 24 | will need a C/C++ compiler to compile extensions. 25 | 26 | **Linux** 27 | 28 | On Linux, you need to install :code:`gcc`, which in most cases is available 29 | via your distribution's packaging system. 30 | Please follow your distribution's instructions on how to install packages. 31 | 32 | **MacOS** 33 | 34 | On MacOS, you need to install :code:`clang`, which is available from 35 | the *Command Line Tools* package. Open a terminal and execute:: 36 | 37 | $ xcode-select --install 38 | 39 | Alternatively, you can download it from the 40 | `Apple Developers page `_. 41 | Log in with your Apple ID, then search and download the 42 | *Command Line Tools for Xcode* package. 43 | 44 | **Windows** 45 | 46 | On Windows, the compiler you need depends on the Python version 47 | you are using. See `this guide `_ 48 | to determine which Microsoft Visual C++ compiler to use with a specific Python version. 
49 | 50 | **Installing** 51 | 52 | Grab a local copy of the source:: 53 | 54 | $ git clone http://github.com/iamDecode/sklearn-pmml-model 55 | $ cd sklearn-pmml-model 56 | 57 | create a virtual environment and activate it:: 58 | 59 | $ python3 -m venv venv 60 | $ source venv/bin/activate 61 | 62 | and install the dependencies:: 63 | 64 | $ pip install -r requirements.txt 65 | 66 | The final step is to build the Cython extensions (this part requires the C/C++ compiler):: 67 | 68 | $ python setup.py build_ext --inplace 69 | 70 | 71 | .. _dependencies: 72 | 73 | Dependencies 74 | ------------ 75 | 76 | The current minimum dependencies to run :code:`sklearn-pmml-model` are: 77 | 78 | - numpy 1.16 or later 79 | - pandas 80 | - scikit-learn 81 | - scipy 82 | - cached-property -------------------------------------------------------------------------------- /docs/release_notes.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= 3 | 4 | .. 
changelog:: 5 | :changelog-url: https://sklearn-pmml-model.readthedocs.io/en/latest/release_notes.html 6 | :github: https://github.com/iamDecode/sklearn-pmml-model/releases 7 | :pypi: https://pypi.org/project/sklearn-pmml-model -------------------------------------------------------------------------------- /models/categorical-test.csv: -------------------------------------------------------------------------------- 1 | "type","npreg","glu","bp","skin","bmi","ped","age" 2 | "Yes",2,128,78,37,43.3,1.224,"(30,40]" 3 | "Yes",12,92,62,7,27.6,0.926,"(40,50]" 4 | "Yes",11,143,94,33,36.6,0.254,"(50,60]" 5 | "Yes",9,164,84,21,30.8,0.831,"(30,40]" 6 | "Yes",8,176,90,34,33.7,0.467,"(50,60]" 7 | "Yes",8,154,78,32,32.4,0.443,"(40,50]" 8 | "Yes",5,139,80,35,31.6,0.361,"(20,30]" 9 | "Yes",5,158,84,41,39.4,0.395,"(20,30]" 10 | "Yes",1,115,70,30,34.6,0.529,"(30,40]" 11 | "Yes",7,150,78,29,35.2,0.692,"(50,60]" 12 | "Yes",10,148,84,48,37.6,1.001,"(50,60]" 13 | "Yes",3,129,92,49,36.4,0.968,"(30,40]" 14 | "Yes",0,198,66,32,41.3,0.502,"(20,30]" 15 | "Yes",0,188,82,14,32,0.682,"(20,30]" 16 | "Yes",3,158,76,36,31.6,0.851,"(20,30]" 17 | "Yes",0,151,90,46,42.1,0.371,"(20,30]" 18 | "Yes",0,95,85,25,37.4,0.247,"(20,30]" 19 | "Yes",14,175,62,30,33.6,0.212,"(30,40]" 20 | "Yes",7,129,68,49,38.5,0.439,"(40,50]" 21 | "Yes",8,155,62,26,34,0.543,"(40,50]" 22 | "Yes",1,180,78.5224821507334,35.5864693480366,43.3,0.282,"(40,50]" 23 | "Yes",8,125,96,33.8348977342098,34.7868326916975,0.232,"(50,60]" 24 | "Yes",2,118,80,30.6522443724509,42.9,0.693,"(20,30]" 25 | "Yes",8,133,72,32.6743195210369,32.9,0.27,"(30,40]" 26 | "Yes",0,141,76.1270147394716,35.5136628118178,42.4,0.205,"(20,30]" 27 | "Yes",3,141,72.1571733202164,30.8080996185329,30,0.761,"(20,30]" 28 | "No",0,165,76,43,47.9,0.259,"(20,30]" 29 | "No",4,99,76,15,23.2,0.223,"(20,30]" 30 | "No",2,99,70,16,20.4,0.235,"(20,30]" 31 | "No",2,110,74,29,32.4,0.698,"(20,30]" 32 | "No",3,148,66,25,32.5,0.256,"(20,30]" 33 | "No",1,71,48,18,20.4,0.323,"(20,30]" 
34 | "No",0,119,66,27,38.8,0.259,"(20,30]" 35 | "No",1,97,68,21,27.2,1.095,"(20,30]" 36 | "No",0,86,68,32,35.8,0.238,"(20,30]" 37 | "No",2,125,60,20,33.8,0.088,"(30,40]" 38 | "No",1,114,66,36,38.1,0.289,"(20,30]" 39 | "No",6,92,62,32,32,0.085,"(40,50]" 40 | "No",0,135,94,46,40.6,0.284,"(20,30]" 41 | "No",2,121,70,32,39.1,0.886,"(20,30]" 42 | "No",12,121,78,17,26.5,0.259,"(60,70]" 43 | "No",4,110,76,20,28.4,0.118,"(20,30]" 44 | "No",3,61,82,28,34.4,0.243,"(40,50]" 45 | "No",6,111,64,39,34.2,0.26,"(20,30]" 46 | "No",1,81,74,41,46.3,1.096,"(30,40]" 47 | "No",0,137,84,27,27.3,0.231,"(50,60]" 48 | "No",1,97,70,40,38.1,0.218,"(20,30]" 49 | "No",2,105,58,40,34.9,0.225,"(20,30]" 50 | "No",0,100,88,60,46.8,0.962,"(30,40]" 51 | "No",1,90,62,12,27.2,0.58,"(20,30]" 52 | "No",3,124,80,33,33.2,0.305,"(20,30]" 53 | "No",5,139,64,35,28.6,0.411,"(20,30]" 54 | -------------------------------------------------------------------------------- /models/linear-model-glm.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-04 16:20:00 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 |
89 | -------------------------------------------------------------------------------- /models/linear-model-lasso.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-05 15:11:34 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
84 | -------------------------------------------------------------------------------- /models/linear-model-lm.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-02 14:58:43 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
52 | -------------------------------------------------------------------------------- /models/linear-model-lmc.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-05-26T11:59:49Z 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |
56 | -------------------------------------------------------------------------------- /models/linear-model-ridge.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-07-05 14:48:04 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 |
90 | -------------------------------------------------------------------------------- /models/linear-model-ridgec.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2021-05-21 21:56:51 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 |
93 | -------------------------------------------------------------------------------- /models/nb-cat-pima.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2020-01-28 17:45:24 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 |
139 | -------------------------------------------------------------------------------- /models/nn-iris.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2022-01-28 11:41:54 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 |
129 | -------------------------------------------------------------------------------- /models/tree-cat-pima-regression.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-06-04T14:28:07Z 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
57 | -------------------------------------------------------------------------------- /models/tree-cat-pima.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-02-01 17:43:13 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | "(20,30]" "(60,70]" 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | "(20,30]" "(60,70]" 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 |
94 | -------------------------------------------------------------------------------- /models/tree-cat.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 2019-02-01 14:07:13 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | "category A" 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | "category B" "category C" 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | "category C" 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
69 | -------------------------------------------------------------------------------- /models/tree-iris.pmml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 2021-07-06T10:18:03Z 6 |
7 | 8 | PMMLPipeline(steps=[('classifier', DecisionTreeClassifier(random_state=1))]) 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 |
112 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel", "Cython", "numpy>=1.16.0"] 3 | build-backend = "setuptools.build_meta:__legacy__" 4 | 5 | [tool.cibuildwheel] 6 | before-build = "python -m pip install cython numpy" 7 | skip = "pp*" 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | scipy==1.15.1 3 | scikit-learn==1.5.2 4 | cached-property==2.0.1 5 | pytest==8.3.4 6 | pandas==2.2.3 7 | pytest-cov==6.0.0 8 | Cython==3.0.10 9 | sphinx==8.1.3 10 | numpydoc==1.8.0 11 | sphinx-autoapi==3.4.0 12 | pydata-sphinx-theme==0.16.1 13 | sphinx-github-changelog==1.4.0 14 | myst-parser==4.0.0 15 | sklearn2pmml==0.113.0 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | from sklearn_pmml_model import __version__ as version 3 | import platform 4 | 5 | # Choose build type. 
# Choose build type: "optimized" or "debug".
build_type = "optimized"

# Long description for package homepage on PyPI.
# NOTE: encoding is pinned to UTF-8 because README.md contains non-ASCII
# characters (checkmarks/multiplication signs in its tables); relying on the
# platform default locale encoding crashes the build on e.g. Windows (cp1252).
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

#########################################################
# Init
#########################################################

# check for Python 2.7 or later
# http://stackoverflow.com/questions/19534896/enforcing-python-version-in-setup-py
import sys
if sys.version_info < (2, 7):
    sys.exit('Sorry, Python < 2.7 is not supported')

import os

from setuptools import setup, find_packages
from setuptools.extension import Extension

# Cython is a hard build-time requirement: the extension modules are shipped
# as .pyx sources and must be cythonized (see cythonize() call below).
try:
    from Cython.Build import cythonize
except ImportError:
    sys.exit("Cython not found. Cython is needed to build the extension modules.")


#########################################################
# Definitions
#########################################################

# Define our base set of compiler and linker flags.
#
# This is geared toward x86_64, see
# https://gcc.gnu.org/onlinedocs/gcc-4.6.4/gcc/i386-and-x86_002d64-Options.html
#
# Customize these as needed.
#
# Note that -O3 may sometimes cause mysterious problems, so we limit ourselves to -O2.
# Modules involving numerical computations.
#
# The x86-specific SIMD flags (-msse, -msse2, -mfma, -mfpmath=sse) are not
# accepted by the Apple Silicon (arm64) toolchain, so fall back to plain -O2
# there. Detection covers both a native arm64 host and cross-compilation of
# arm64/universal2 wheels via cibuildwheel (CIBW_ARCHS env var).
#
# BUGFIX: the previous condition read
#   ... == 'aarch64' or 'universal2' or 'arm64' in os.environ.get(...)
# where the bare string 'universal2' is always truthy, so the Apple Silicon
# branch was taken unconditionally and the tuned x86 flags were never used.
# Also, platform.machine() reports 'arm64' on macOS (not 'aarch64'); both
# spellings are accepted here for safety.
_cibw_archs = os.environ.get('CIBW_ARCHS', '')
if (platform.system() == 'Darwin' and platform.machine() in ('arm64', 'aarch64')) \
        or 'universal2' in _cibw_archs or 'arm64' in _cibw_archs:  # Apple Silicon
    extra_compile_args_math_optimized = ['-O2']
    extra_compile_args_math_debug = ['-O0', '-g']
else:
    extra_compile_args_math_optimized = ['-mtune=native', '-march=native', '-O2', '-msse', '-msse2', '-mfma', '-mfpmath=sse']
    extra_compile_args_math_debug = ['-mtune=native', '-march=native', '-O0', '-g']

extra_link_args_math_optimized = []
extra_link_args_math_debug = []

# Modules that do not involve numerical computations.
extra_compile_args_nonmath_optimized = ['-O2']
extra_compile_args_nonmath_debug = ['-O0', '-g']
extra_link_args_nonmath_optimized = []
extra_link_args_nonmath_debug = []

# Additional flags to compile/link with OpenMP.
openmp_compile_args = ['-fopenmp']
openmp_link_args = ['-fopenmp']


#########################################################
# Helpers
#########################################################

# Make absolute cimports work.
#
# See
# https://github.com/cython/cython/wiki/PackageHierarchy
#
# For example: my_include_dirs = [np.get_include()]
import numpy as np
my_include_dirs = [".", np.get_include()]
83 | if build_type == 'optimized': 84 | my_extra_compile_args_math = extra_compile_args_math_optimized 85 | my_extra_compile_args_nonmath = extra_compile_args_nonmath_optimized 86 | my_extra_link_args_math = extra_link_args_math_optimized 87 | my_extra_link_args_nonmath = extra_link_args_nonmath_optimized 88 | my_debug = False 89 | print( "build configuration selected: optimized" ) 90 | elif build_type == 'debug': 91 | my_extra_compile_args_math = extra_compile_args_math_debug 92 | my_extra_compile_args_nonmath = extra_compile_args_nonmath_debug 93 | my_extra_link_args_math = extra_link_args_math_debug 94 | my_extra_link_args_nonmath = extra_link_args_nonmath_debug 95 | my_debug = True 96 | print( "build configuration selected: debug" ) 97 | else: 98 | raise ValueError("Unknown build configuration '%s'; valid: 'optimized', 'debug'" % (build_type)) 99 | 100 | 101 | def declare_cython_extension(extName, use_math=False, use_openmp=False, include_dirs=None): 102 | """Declare a Cython extension module for setuptools. 103 | Parameters: 104 | extName : str 105 | Absolute module name, e.g. use `mylibrary.mypackage.mymodule` 106 | for the Cython source file `mylibrary/mypackage/mymodule.pyx`. 107 | use_math : bool 108 | If True, set math flags and link with ``libm``. 109 | use_openmp : bool 110 | If True, compile and link with OpenMP. 111 | Return value: 112 | Extension object 113 | that can be passed to ``setuptools.setup``. 
114 | """ 115 | extPath = extName.replace(".", os.path.sep)+".pyx" 116 | 117 | if use_math and os.name != 'nt': # Windows crashes when using m library 118 | compile_args = list(my_extra_compile_args_math) # copy 119 | link_args = list(my_extra_link_args_math) 120 | libraries = ["m"] # link libm; this is a list of library names without the "lib" prefix 121 | else: 122 | compile_args = list(my_extra_compile_args_nonmath) 123 | link_args = list(my_extra_link_args_nonmath) 124 | libraries = None # value if no libraries, see setuptools.extension._Extension 125 | 126 | # OpenMP 127 | if use_openmp: 128 | compile_args.insert( 0, openmp_compile_args ) 129 | link_args.insert( 0, openmp_link_args ) 130 | 131 | # See 132 | # http://docs.cython.org/src/tutorial/external.html 133 | # 134 | # on linking libraries to your Cython extensions. 135 | return Extension( 136 | extName, 137 | [extPath], 138 | extra_compile_args=compile_args, 139 | extra_link_args=link_args, 140 | include_dirs=include_dirs, 141 | libraries=libraries 142 | ) 143 | 144 | 145 | ######################################################### 146 | # Set up modules 147 | ######################################################### 148 | 149 | ext_module_tree = declare_cython_extension("sklearn_pmml_model.tree._tree", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 150 | ext_module_quad_tree = declare_cython_extension("sklearn_pmml_model.tree.quad_tree", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 151 | ext_module_criterion = declare_cython_extension("sklearn_pmml_model.tree._criterion", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 152 | ext_module_splitter = declare_cython_extension("sklearn_pmml_model.tree._splitter", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 153 | ext_module_utils = declare_cython_extension("sklearn_pmml_model.tree._utils", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 154 | ext_module_gb = 
declare_cython_extension("sklearn_pmml_model.ensemble._gradient_boosting", use_math=True, use_openmp=False, include_dirs=my_include_dirs) 155 | 156 | cython_ext_modules = [ext_module_tree, ext_module_quad_tree, ext_module_criterion, ext_module_splitter, ext_module_utils, ext_module_gb] 157 | 158 | # Call cythonize() explicitly, as recommended in the Cython documentation. See 159 | # http://cython.readthedocs.io/en/latest/src/reference/compilation.html#compiling-with-distutils 160 | # 161 | # This will favor Cython's own handling of '.pyx' sources over that provided by setuptools. 162 | # 163 | # Note that my_ext_modules is just a list of Extension objects. We could add any C sources (not coming from Cython modules) here if needed. 164 | # cythonize() just performs the Cython-level processing, and returns a list of Extension objects. 165 | my_ext_modules = cythonize(cython_ext_modules, include_path=my_include_dirs, gdb_debug=my_debug, compiler_directives={'legacy_implicit_noexcept': True}) 166 | 167 | 168 | ######################################################### 169 | # Call setup() 170 | ######################################################### 171 | 172 | setup( 173 | name="sklearn-pmml-model", 174 | version=version, 175 | author="Dennis Collaris", 176 | author_email="d.collaris@me.com", 177 | description = "A library to parse PMML models into Scikit-learn estimators.", 178 | long_description = long_description, 179 | long_description_content_type="text/markdown", 180 | url="https://github.com/iamDecode/sklearn-pmml-model", 181 | license = "BSD-2-Clause", 182 | classifiers = [ 183 | "Programming Language :: Python :: 3", 184 | "License :: OSI Approved :: BSD License", 185 | "Operating System :: OS Independent", 186 | "Intended Audience :: Developers", 187 | "Intended Audience :: Science/Research", 188 | "Topic :: Software Development", 189 | "Topic :: Scientific/Engineering" 190 | ], 191 | 192 | setup_requires = ["cython", "numpy>=1.16.0", "pytest-runner"], 193 
| install_requires = [ 194 | 'numpy>=1.16.0', 195 | 'pandas', 196 | 'scipy', 197 | 'scikit-learn', 198 | 'cached-property' 199 | ], 200 | tests_require = [ 201 | 'pytest', 202 | ], 203 | ext_modules = my_ext_modules, 204 | packages=find_packages(), 205 | 206 | # Install also Cython headers so that other Cython modules can cimport ours 207 | # 208 | # Fileglobs relative to each package, **does not** automatically recurse into subpackages. 209 | # FIXME: force sdist, but sdist only, to keep the .pyx files (this puts them also in the bdist) 210 | package_data={'sklearn_pmml_model.tree': ['*.pxd', '*.pyx'], 'sklearn_pmml_model.ensemble': ['*.pxd', '*.pyx']}, 211 | 212 | # Disable zip_safe, because: 213 | # - Cython won't find .pxd files inside installed .egg, hard to compile libs depending on this one 214 | # - dynamic loader may need to have the library unzipped to a temporary directory anyway (at import time) 215 | zip_safe = False 216 | ) 217 | -------------------------------------------------------------------------------- /sklearn_pmml_model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | sklearn-pmml-model. 3 | 4 | A Python library that provides import functionality to all major estimator 5 | classes of the popular machine learning library scikit-learn using PMML. 6 | This enables portability and interoperability with a wide range of different 7 | languages, toolkits and enterprise software. 8 | """ 9 | 10 | # License: BSD 2-Clause 11 | 12 | __version__ = '1.0.7' 13 | -------------------------------------------------------------------------------- /sklearn_pmml_model/auto_detect/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn.auto_detect` module implements methods to automatically 3 | detect the type of model from a PMML file. 
"""
The :mod:`sklearn_pmml_model.auto_detect` module implements methods to
automatically detect the type of model from a PMML file.
"""

# License: BSD 2-Clause

from .base import auto_detect_estimator, auto_detect_classifier, auto_detect_regressor

__all__ = [
    'auto_detect_estimator',
    'auto_detect_classifier',
    'auto_detect_regressor',
]
def auto_detect_estimator(pmml, **kwargs):
  """
  Automatically detect and return the described estimator from PMML file.

  Parameters
  ----------
  pmml : str, object
      Filename or file object containing PMML data.

  """
  # The PMML data is consumed twice: once here to inspect the target field,
  # and once by the detected estimator. Non-seekable streams therefore need
  # to be buffered into a rewindable in-memory stream first.
  if isinstance(pmml, io.IOBase) and not pmml.seekable():
    buffered = pmml.read()
    if isinstance(buffered, bytes):
      pmml = io.BytesIO(buffered)
    if isinstance(buffered, str):
      pmml = io.StringIO(buffered)

  base = PMMLBaseEstimator(pmml=pmml)
  target_name = base.target_field.attrib['name']
  target_type = base.field_mapping[target_name][1]

  # Categorical (or plain string) targets indicate classification;
  # anything else is treated as regression.
  if isinstance(target_type, Category) or target_type is str:
    return auto_detect_classifier(pmml, **kwargs)
  return auto_detect_regressor(pmml, **kwargs)
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML MiningModel into a fitted RandomForestClassifier."""
    PMMLBaseClassifier.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
      raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
      raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['majorityVote', 'average']:
      raise Exception('PMML model ensemble should use majority vote or average.')

    # Parse segments: only segments guarded by a <True/> predicate are
    # supported; all others are dropped with a warning.
    segments = segmentation.findall('Segment')
    valid_segments = [segment for segment in segments if segment.find('True') is not None]

    if len(valid_segments) < len(segments):
      warnings.warn(
        'Warning: {} segment(s) ignored because of unsupported predicate.'
        .format(len(segments) - len(valid_segments))
      )

    n_estimators = len(valid_segments)
    RandomForestClassifier.__init__(self, n_estimators=n_estimators, n_jobs=n_jobs)
    self._validate_estimator()

    # Build a template tree carrying the metadata sklearn expects on each
    # member estimator (classes, feature/output/class counts).
    clf = self._make_estimator(append=False, random_state=123)
    clf.classes_ = self.classes_
    try:
      # scikit-learn >= 0.24 renamed n_features_ to n_features_in_.
      clf.n_features_in_ = self.n_features_in_
    except AttributeError:
      clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    clf.n_classes_ = self.n_classes_
    self.template_estimator = clf

    self.estimators_ = [get_tree(self, s) for s in valid_segments]

    # Required after constructing trees, because categories may be inferred in
    # the parsing process
    target = self.target_field.get('name')
    fields = [field for name, field in self.fields.items() if name != target]
    for clf in self.estimators_:
      # -1 marks a non-categorical feature; otherwise the category count.
      n_categories = np.asarray([
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
      ], dtype=np.int32, order='C')
      clf.n_categories = n_categories
      clf.tree_.set_n_categories(n_categories)

    self.categorical = [x != -1 for x in self.estimators_[0].n_categories]
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML MiningModel into a fitted RandomForestRegressor."""
    PMMLBaseRegressor.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
      raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
      raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['majorityVote', 'average']:
      raise Exception('PMML model ensemble should use majority vote or average.')

    # Parse segments: only segments guarded by a <True/> predicate are
    # supported; all others are dropped with a warning.
    segments = segmentation.findall('Segment')
    valid_segments = [segment for segment in segments if segment.find('True') is not None]

    if len(valid_segments) < len(segments):
      warnings.warn(
        'Warning: {} segment(s) ignored because of unsupported predicate.'.format(
          len(segments) - len(valid_segments)
        )
      )

    n_estimators = len(valid_segments)
    self.n_outputs_ = 1
    RandomForestRegressor.__init__(self, n_estimators=n_estimators, n_jobs=n_jobs)
    self._validate_estimator()

    # Build a template tree carrying the metadata sklearn expects on each
    # member estimator.
    clf = self._make_estimator(append=False, random_state=123)
    try:
      # scikit-learn >= 0.24 renamed n_features_ to n_features_in_.
      clf.n_features_in_ = self.n_features_in_
    except AttributeError:
      clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    self.template_estimator = clf

    # NOTE(review): rescale_factor=0.1 is hard-coded here — presumably to
    # undo a scaling applied by the PMML exporter; confirm against get_tree.
    self.estimators_ = [get_tree(self, s, rescale_factor=0.1) for s in valid_segments]

    # Required after constructing trees, because categories may be inferred in
    # the parsing process
    target = self.target_field.get('name')
    fields = [field for name, field in self.fields.items() if name != target]
    for clf in self.estimators_:
      # -1 marks a non-categorical feature; otherwise the category count.
      n_categories = np.asarray([
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
      ], dtype=np.int32, order='C')
      clf.n_categories = n_categories
      clf.tree_.set_n_categories(n_categories)

    self.categorical = [x != -1 for x in self.estimators_[0].n_categories]
class PMMLGeneralizedLinearRegressor(OneHotEncodingMixin, PMMLBaseRegressor):
  """
  Abstract class for Generalized Linear Models (GLMs).

  The coefficients are read from the <ParamMatrix> inside the PMML
  <GeneralRegressionModel> element; the <PPMatrix> links each parameter
  to a predictor field in the data.

  Parameters
  ----------
  pmml : str, object
      Filename or file object containing PMML data.

  Notes
  -----
  Specification: http://dmg.org/pmml/v4-3/GeneralRegression.html

  """

  def __init__(self, pmml):
    PMMLBaseRegressor.__init__(self, pmml)
    OneHotEncodingMixin.__init__(self)

    # All GLM information lives in a single GeneralRegressionModel element.
    glm = self.root.find('GeneralRegressionModel')
    if glm is None:
      raise Exception('PMML model does not contain GeneralRegressionModel.')

    self.coef_ = np.array(_get_coefficients(self, glm))
    self.intercept_ = _get_intercept(glm)
46 | 47 | The PMML model consists out of a element, 48 | containing a element that contains zero or more 49 | elements describing the coefficients for each parameter. Parameters 50 | are described in the element, that maps parameters to fields in 51 | the data. 52 | 53 | Parameters 54 | ---------- 55 | pmml : str, object 56 | Filename or file object containing PMML data. 57 | 58 | Notes 59 | ----- 60 | Specification: http://dmg.org/pmml/v4-3/GeneralRegression.html 61 | 62 | """ 63 | 64 | def __init__(self, pmml): 65 | PMMLBaseClassifier.__init__(self, pmml) 66 | OneHotEncodingMixin.__init__(self) 67 | 68 | # Import coefficients and intercepts 69 | model = self.root.find('GeneralRegressionModel') 70 | 71 | if model is None: 72 | raise Exception('PMML model does not contain GeneralRegressionModel.') 73 | 74 | self.coef_ = np.array([_get_coefficients(self, model)]) 75 | self.intercept_ = _get_intercept(model) 76 | 77 | 78 | def _get_coefficients(linear_model, model): 79 | """ 80 | Obtain the coefficients for the GLM regression. 81 | 82 | Raises an exception when we notice non linear parameter configurations. 83 | 84 | Parameters 85 | ---------- 86 | linear_model : PMMLGeneralizedLinearRegressor, PMMLGeneralizedLinearClassifier 87 | The PMML class representing the classifier. Should contain at least target_field, 88 | fields and field_mapping properties. 89 | 90 | model : eTree.Element 91 | The element that is assumed to contains a 92 | and element. 93 | 94 | Returns 95 | ------- 96 | coefficients: numpy.ndarray 97 | Coefficient value for every field. Zero if not present. 
98 | 99 | """ 100 | pp = model.find('PPMatrix') 101 | params = model.find('ParamMatrix') 102 | 103 | def coefficient_for_parameter(p): 104 | if not p: 105 | return 0 106 | 107 | pcells = params.findall(f"PCell[@parameterName='{p}']") 108 | if len(pcells) > 1: 109 | raise Exception('This model does not support multiple outputs.') 110 | 111 | if not pcells: 112 | return 0 113 | 114 | return float(pcells[0].get('beta')) 115 | 116 | def parameter_for_category(cells, category): 117 | cell = [cell for cell in cells if cell.get('value') == category] 118 | 119 | if not cell: 120 | return None 121 | 122 | return cell[0].get('parameterName') 123 | 124 | def coefficients_for_field(name, field): 125 | pp_cells = pp.findall(f"PPCell[@predictorName='{name}']") 126 | 127 | if not pp_cells: 128 | return [0] 129 | 130 | if field.get('optype') != 'categorical': 131 | if len(pp_cells) > 1: 132 | raise Exception('PMML model is not linear.') 133 | 134 | return [coefficient_for_parameter(pp_cells[0].get('parameterName'))] 135 | 136 | return [ 137 | coefficient_for_parameter(parameter_for_category(pp_cells, c)) 138 | for c in linear_model.field_mapping[name][1].categories 139 | ] 140 | 141 | target = linear_model.target_field.get('name') 142 | fields = {name: field for name, field in linear_model.fields.items() if name != target} 143 | 144 | return list(chain.from_iterable([ 145 | coefficients_for_field(name, field) 146 | for name, field in fields.items() 147 | 148 | ])) 149 | 150 | 151 | def _get_intercept(model): 152 | """ 153 | Find all parameters that are not included in the . 154 | 155 | These constitute the intercept. In the very unlikely case there are multiple 156 | parameters fitting this criteria, we sum the result. 157 | 158 | Parameters 159 | ---------- 160 | model : eTree.Element 161 | The element that is assumed to contains a 162 | and element. 163 | 164 | Returns 165 | ------- 166 | intercept : float 167 | Value of the intercept of the method. 
168 | 169 | """ 170 | pp = model.find('PPMatrix') 171 | params = model.find('ParamMatrix') 172 | 173 | specified = [p.get('parameterName') for p in pp.findall('PPCell')] 174 | used = [p.get('parameterName') for p in params.findall('PCell')] 175 | 176 | intercepts = set(used) - set(specified) 177 | intercepts = list(chain.from_iterable([ 178 | params.findall(f"PCell[@parameterName='{p}']") 179 | for p in intercepts 180 | ])) 181 | 182 | return sum([float(i.get('beta')) for i in intercepts]) 183 | -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.naive_bayes 2 | 3 | This package contains the `PMMLGaussianNB` classifier. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | import pandas as pd 10 | from sklearn_pmml_model.naive_bayes import PMMLGaussianNB 11 | 12 | # Prepare data 13 | df = pd.read_csv('models/categorical-test.csv') 14 | Xte = df.iloc[:, 1:] 15 | 16 | clf = PMMLGaussianNB(pmml="models/nb-cat-pima.pmml") 17 | clf.predict(Xte) 18 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn_pmml_model.naive_bayes` module implements Naive Bayes 3 | algorithms. These are supervised learning methods based on applying Bayes' 4 | theorem with strong (naive) feature independence assumptions. 
5 | """ 6 | 7 | # License: BSD 2-Clause 8 | 9 | from .implementations import PMMLGaussianNB 10 | 11 | __all__ = ['PMMLGaussianNB'] 12 | -------------------------------------------------------------------------------- /sklearn_pmml_model/naive_bayes/implementations.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn_pmml_model.base import PMMLBaseClassifier, OneHotEncodingMixin 4 | from sklearn.naive_bayes import GaussianNB 5 | import numpy as np 6 | from itertools import chain 7 | 8 | 9 | class PMMLGaussianNB(OneHotEncodingMixin, PMMLBaseClassifier, GaussianNB): 10 | """ 11 | Gaussian Naive Bayes classifier. 12 | 13 | Can perform online updates to model parameters via :meth:`partial_fit`. 14 | For details on algorithm used to update feature means and variance online, 15 | see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque: 16 | 17 | http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf 18 | 19 | Parameters 20 | ---------- 21 | pmml : str, object 22 | Filename or file object containing PMML data. 
23 | 24 | Notes 25 | ----- 26 | Specification: http://dmg.org/pmml/v4-3/NaiveBayes.html 27 | 28 | """ 29 | 30 | def __init__(self, pmml): 31 | PMMLBaseClassifier.__init__(self, pmml) 32 | OneHotEncodingMixin.__init__(self) 33 | 34 | model = self.root.find('NaiveBayesModel') 35 | 36 | if model is None: 37 | raise Exception('PMML model does not contain NaiveBayesModel.') 38 | 39 | inputs = model.find('BayesInputs') 40 | 41 | target_values = { 42 | target: self._get_target_values(inputs, target) 43 | for target in self.classes_ 44 | } 45 | 46 | try: 47 | outputs = model.find('BayesOutput').find('TargetValueCounts').findall('TargetValueCount') 48 | counts = [int(x.get('count')) for x in outputs] 49 | self.class_prior_ = np.array([x / np.sum(counts) for x in counts]) 50 | except AttributeError: 51 | self.class_prior_ = np.array([1 / len(self.classes_) for _ in self.classes_]) 52 | 53 | self.theta_ = np.array([ 54 | [float(value.get('mean', 0)) for value in target_values[target]] 55 | for target in self.classes_ 56 | ]) 57 | try: 58 | self.sigma_ = np.array([ 59 | [float(value.get('variance', 0)) for value in target_values[target]] 60 | for target in self.classes_ 61 | ]) 62 | finally: 63 | pass 64 | 65 | try: 66 | self.var_ = np.array([ 67 | [float(value.get('variance', 0)) for value in target_values[target]] 68 | for target in self.classes_ 69 | ]) 70 | finally: 71 | pass 72 | 73 | def _get_target_values(self, inputs, target): 74 | def target_value_for_category(bayesInput, category): 75 | counts = bayesInput.find(f"PairCounts[@value='{category}']") 76 | target_counts = counts.find('TargetValueCounts') 77 | return target_counts.find(f"TargetValueCount[@value='{target}']") 78 | 79 | def target_value_for_field(name, field): 80 | bayesInput = inputs.find(f"BayesInput[@fieldName='{name}']") 81 | 82 | if field.get('optype') != 'categorical': 83 | stats = bayesInput.find('TargetValueStats') 84 | targetValue = stats.find(f"TargetValueStat[@value='{target}']") 85 | distribution 
  def _get_target_values(self, inputs, target):
    """Collect per-feature Gaussian parameters for one target class.

    Returns one element per (expanded) feature: either a
    <GaussianDistribution> element (continuous fields) or a dict with
    'mean'/'variance' keys (categorical fields).
    """
    def target_value_for_category(bayesInput, category):
      # Look up the count of `target` for one category level of this input.
      counts = bayesInput.find(f"PairCounts[@value='{category}']")
      target_counts = counts.find('TargetValueCounts')
      return target_counts.find(f"TargetValueCount[@value='{target}']")

    def target_value_for_field(name, field):
      bayesInput = inputs.find(f"BayesInput[@fieldName='{name}']")

      if field.get('optype') != 'categorical':
        stats = bayesInput.find('TargetValueStats')
        targetValue = stats.find(f"TargetValueStat[@value='{target}']")
        distribution = targetValue.find('GaussianDistribution')

        if distribution is None:
          # Any non-Gaussian distribution cannot be mapped onto GaussianNB.
          distributionName = targetValue.find('*').tag
          raise NotImplementedError(f'Distribution "{distributionName}" not implemented, or not supported '
                                    f'by scikit-learn')

        return [distribution]
      else:
        counts = [
          float(target_value_for_category(bayesInput, c).get('count'))
          for c in self.field_mapping[name][1].categories
        ]
        # Categorical fields are emulated as one pseudo-Gaussian per
        # category: the mean is the category's relative frequency, and the
        # huge variance presumably flattens the Gaussian so the likelihood
        # is dominated by the mean — TODO confirm this approximation.
        return [
          {
            'mean': count / np.sum(counts),
            'variance': 999999999
          }
          for count in counts
        ]

    return list(chain.from_iterable([
      target_value_for_field(name, field)
      for name, field in self.fields.items()
      if field is not self.target_field
    ]))
"""
The :mod:`sklearn_pmml_model.neighbors` module implements the k-nearest
neighbors algorithm.
"""

# License: BSD 2-Clause

from ._classes import PMMLKNeighborsClassifier, PMMLKNeighborsRegressor

__all__ = ['PMMLKNeighborsClassifier', 'PMMLKNeighborsRegressor']
  def __init__(self, leaf_size=30):
    """Parse the PMML NearestNeighborModel: metric, k, and training data."""
    knn_model = self.root.find('NearestNeighborModel')

    if knn_model is None:
      raise Exception('PMML model does not contain NearestNeighborModel.')

    self.n_neighbors = int(knn_model.get('numberOfNeighbors'))
    self.algorithm = 'auto'
    self.leaf_size = leaf_size
    self.p = 2  # default Minkowski power; overwritten below if specified
    self.metric_params = None
    self.outputs_2d_ = False

    # Set metric and parameters
    measure_element = knn_model.find('ComparisonMeasure')

    if measure_element is None:
      raise Exception('PMML model does not contain ComparisonMeasure.')

    # The distance metric is encoded as the tag of the first child element.
    measure = next(x for x in measure_element)

    # PMML measure name -> scikit-learn metric name.
    measures = {
      'euclidean': 'euclidean',
      'chebychev': 'chebyshev',
      'cityBlock': 'manhattan',
      'minkowski': 'minkowski',
      'simpleMatching': 'matching',
      'jaccard': 'jaccard',
      'tanimoto': 'rogerstanimoto',
    }

    if measure.tag not in measures:
      raise Exception(f'PMML model uses unsupported distance metric: "{measure.tag}".')

    self.metric = measures[measure.tag]

    if self.metric == 'minkowski':
      self.p = float(measure.get('p-parameter'))
      self.metric_params = {'p': self.p}

    self._check_algorithm_metric()

    # Set training instances: k-NN stores its "fit" data inside the PMML
    # as an InlineTable.
    instances = knn_model.find('TrainingInstances')

    fields_element = instances.find('InstanceFields')
    # Field name -> column tag suffix (the column attribute may carry a
    # namespace prefix such as "data:age", hence the split on ':').
    mapping = {x.get('field'): x.get('column').split(':')[-1] for x in fields_element}
    target = self.target_field.get('name')
    fields = [x.get('field') for x in fields_element if x.get('field') != target]

    # field_mapping[f][1] is the type-conversion callable for field f.
    data = [
      [
        self.field_mapping[f][1](next(x for x in row if x.tag.endswith(mapping[f])).text)
        for f in fields
      ]
      for row in instances.find('InlineTable')
    ]

    self._X = pd.DataFrame(data, columns=fields)
    self._y = np.array([
      self.field_mapping[target][1](next(x for x in row if x.tag.endswith(mapping[target])).text)
      for row in instances.find('InlineTable')
    ])
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML and fit on the training instances embedded in it."""
    PMMLBaseClassifier.__init__(self, pmml)
    KNeighborsClassifier.__init__(self, n_jobs=n_jobs)
    # PMMLBaseKNN extracts n_neighbors, the metric, and the training table
    # (self._X / self._y) from the PMML document.
    PMMLBaseKNN.__init__(self)

    # k-NN is lazy: "fitting" just indexes the stored training instances.
    KNeighborsClassifier.fit(self, self._X, self._y)
  def __init__(self, pmml, n_jobs=None):
    """Parse the PMML and fit on the training instances embedded in it."""
    PMMLBaseRegressor.__init__(self, pmml)
    KNeighborsRegressor.__init__(self, n_jobs=n_jobs)
    # PMMLBaseKNN extracts n_neighbors, the metric, and the training table
    # (self._X / self._y) from the PMML document.
    PMMLBaseKNN.__init__(self)

    # k-NN is lazy: "fitting" just indexes the stored training instances.
    KNeighborsRegressor.fit(self, self._X, self._y)
class PMMLBaseNeuralNetwork:
  """
  Abstract class for Neural Network models.

  The PMML model consists out of a <NeuralNetwork> element, containing a
  <NeuralInputs> element that describes the input layer neurons with
  <NeuralInput> elements. Next, <NeuralLayer> elements describe all other
  neurons with associated weights and biases. The activation function is
  either specified globally with the activationFunction attribute on the
  <NeuralNetwork> element, or the same attribute on each layer. Note however
  that scikit-learn only supports a single activation function for all hidden
  layers. Finally, the <NeuralOutputs> element describes the output layer.
  The output is currently expected to match the target field in the mining
  schema.

  Notes
  -----
  Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html

  """

  def __init__(self):
    nn_model = self.root.find('NeuralNetwork')

    if nn_model is None:
      raise Exception('PMML model does not contain NeuralNetwork.')

    inputs = nn_model.find('NeuralInputs')

    if inputs is None:
      raise Exception('PMML model does not contain NeuralInputs.')

    # Map data field name -> neural input neuron id.
    mapping = {
      x.find('DerivedField').find('FieldRef').get('field'): x.get('id')
      for x in inputs.findall('NeuralInput')
    }

    # Every DataField (except the target) must map 1:1 onto an input neuron;
    # anything else implies preprocessing inside the PMML, which we reject.
    target = self.target_field.get('name')
    fields = [name for name, field in self.fields.items() if name != target and field.tag == 'DataField']
    if set(mapping.keys()) != set(fields):
      raise Exception('PMML model preprocesses the data which currently unsupported.')

    layers = [layer for layer in nn_model.findall('NeuralLayer')]
    if isinstance(self, PMMLBaseClassifier) and len(self.classes_) == 2:
      # For binary classifiers, drop everything after the first layer with an
      # 'identity' activation — presumably the exporter appends extra output
      # normalization layers there that MLPClassifier handles itself.
      # TODO confirm against exporter output.
      index = next((i + 1 for i, layer in enumerate(layers) if layer.get('activationFunction') == 'identity'), None)
      layers = layers[:index]

    if len(layers) == 0:
      raise Exception('PMML model does not contain any NeuralLayer elements.')

    self.n_layers_ = len(layers) + 1  # +1 for input layer

    neurons = [layer.findall('Neuron') for layer in layers]
    # The last layer is the output layer, hence [:-1].
    self.hidden_layer_sizes = [len(neuron) for neuron in neurons][:-1]

    # Determine activation function (PMML name -> scikit-learn name).
    activation_functions = {
      'logistic': 'logistic',
      'tanh': 'tanh',
      'identity': 'identity',
      'rectifier': 'relu'
    }
    activation_function = nn_model.get('activationFunction')

    if activation_function is None:
      # No global attribute: fall back to the first layer's attribute.
      activation_function = layers[0].get('activationFunction')

    layer_activations = [
      layer.get('activationFunction')
      for layer in layers[:-1]
      if layer.get('activationFunction') is not None
    ]

    # scikit-learn supports only one activation for all hidden layers.
    if len(np.unique([activation_function] + layer_activations)) > 1:
      raise Exception('Neural networks with different activation functions per '
                      'layer are not currently supported by scikit-learn.')

    if activation_function not in activation_functions:
      raise Exception('PMML model uses unsupported activationFunction.')

    self.activation = activation_functions[activation_function]

    # Set neuron weights: one (fan_in, fan_out) matrix per layer, where the
    # first layer's fan-in is the number of input neurons.
    sizes = list(zip(
      [len(mapping)] + [len(layer) for layer in layers][:-1],
      [len(layer) for layer in layers]
    ))

    self.coefs_ = [np.zeros(shape=s) for s in sizes]
    self.intercepts_ = [
      np.array([float(neuron.get('bias', 0)) for neuron in layer])
      for layer in neurons
    ]

    # Fill the weight matrices from each neuron's <Con> connections, indexed
    # by the id of the source neuron in the previous layer.
    field_ids = [mapping[field] for field in fields]
    for li, layer in enumerate(neurons):
      if li == 0:
        layer_ids = field_ids
      else:
        layer_ids = [x.get('id') for x in neurons[li - 1]]
      for ni, neuron in enumerate(layer):
        for connection in neuron.findall('Con'):
          ci = layer_ids.index(connection.get('from'))
          self.coefs_[li][ci, ni] = float(connection.get('weight'))
15 | 16 | Parameters 17 | ---------- 18 | pmml : str, object 19 | Filename or file object containing PMML data. 20 | 21 | Notes 22 | ----- 23 | Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html 24 | 25 | """ 26 | 27 | def __init__(self, pmml): 28 | PMMLBaseClassifier.__init__(self, pmml) 29 | MLPClassifier.__init__(self) 30 | PMMLBaseNeuralNetwork.__init__(self) 31 | 32 | if len(self.classes_) == 2: 33 | self.out_activation_ = "logistic" 34 | self.n_outputs_ = 1 35 | else: 36 | self.out_activation_ = "softmax" 37 | self.n_outputs_ = len(self.classes_) 38 | 39 | target_type: Category = get_type(self.target_field) 40 | self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) 41 | self._label_binarizer.classes_ = np.array(target_type.categories) 42 | self._label_binarizer.y_type_ = type_of_target(target_type.categories) 43 | self._label_binarizer.sparse_input_ = False 44 | 45 | def fit(self, x, y): 46 | return PMMLBaseClassifier.fit(self, x, y) 47 | 48 | def _more_tags(self): 49 | return MLPClassifier._more_tags(self) 50 | 51 | 52 | class PMMLMLPRegressor(PMMLBaseRegressor, PMMLBaseNeuralNetwork, MLPRegressor): 53 | """ 54 | Multi-layer Perceptron regressor. 55 | 56 | Parameters 57 | ---------- 58 | pmml : str, object 59 | Filename or file object containing PMML data. 
60 | 61 | Notes 62 | ----- 63 | Specification: http://dmg.org/pmml/v4-3/NeuralNetwork.html 64 | 65 | """ 66 | 67 | def __init__(self, pmml): 68 | PMMLBaseRegressor.__init__(self, pmml) 69 | MLPRegressor.__init__(self) 70 | PMMLBaseNeuralNetwork.__init__(self) 71 | 72 | self.out_activation_ = "identity" 73 | 74 | def fit(self, x, y): 75 | return PMMLBaseRegressor.fit(self, x, y) 76 | 77 | def _more_tags(self): 78 | return MLPRegressor._more_tags(self) 79 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.svm 2 | 3 | This package contains the `PMMLLinearSVC`, `PMMLNuSVC` and `PMMLSVC` classifier models, as well as the `PMMLLinearSVR`, `PMMLNuSVR` and `PMMLSVR` regression models. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | import pandas as pd 10 | from sklearn_pmml_model.svm import PMMLSVC 11 | 12 | # Prepare data 13 | df = pd.read_csv('models/categorical-test.csv') 14 | Xte = df.iloc[:, 1:] 15 | 16 | clf = PMMLSVC(pmml="models/svc-cat-pima.pmml") 17 | clf.predict(Xte) 18 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn.svm` module includes Support Vector Machine algorithms. 
3 | """ 4 | 5 | # License: BSD 2-Clause 6 | 7 | from ._classes import PMMLLinearSVC, PMMLLinearSVR, PMMLNuSVC, PMMLNuSVR, PMMLSVC, PMMLSVR 8 | 9 | __all__ = ['PMMLLinearSVC', 'PMMLLinearSVR', 'PMMLNuSVC', 'PMMLNuSVR', 'PMMLSVC', 'PMMLSVR'] 10 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/_base.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn_pmml_model.base import PMMLBaseRegressor, PMMLBaseClassifier, parse_array 4 | import numpy as np 5 | 6 | 7 | class PMMLBaseSVM: 8 | """ 9 | Abstract class for Support Vector Machines. 10 | 11 | The PMML model consists out of a element, 12 | containing a element that contains a 13 | element describing support vectors, and a element describing 14 | the coefficients for each support vector. Support vectors are referenced from 15 | a element, in which the true support vectors are described 16 | using elements. Furthermore, the model contains one out of 17 | , , or 18 | describing the kernel function used. 19 | 20 | Parameters 21 | ---------- 22 | pmml : str, object 23 | Filename or file object containing PMML data. 
24 | 25 | Notes 26 | ----- 27 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachineModel.html 28 | 29 | """ 30 | 31 | def __init__(self): 32 | # Import coefficients and intercepts 33 | model = self.root.find('SupportVectorMachineModel') 34 | 35 | if model is None: 36 | raise Exception('PMML model does not contain SupportVectorMachineModel.') 37 | 38 | vector_dictionary = model.find('VectorDictionary') 39 | svms = model.findall('SupportVectorMachine') 40 | coefficients = [svm.find('Coefficients') for svm in svms] 41 | 42 | self.shape_fit_ = (0, len(vector_dictionary.find('VectorFields'))) 43 | self.support_ = np.array([ 44 | int(x.get('id')) 45 | for x in vector_dictionary.findall('VectorInstance') 46 | ]).astype(np.int32) 47 | 48 | classes = [None, None] if isinstance(self, PMMLBaseRegressor) else self.classes_ 49 | 50 | self._n_support = np.array([ 51 | len(get_overlapping_vectors(get_alt_svms(svms, classes, c))) 52 | for c in classes 53 | ]).astype(np.int32) 54 | 55 | self.support_vectors_ = np.array([ 56 | get_vectors(vector_dictionary, s) for s in self.support_ 57 | ]) 58 | 59 | self._intercept_ = self.intercept_ = np.array([float(cs.get('absoluteValue')) for cs in coefficients]) 60 | self._dual_coef_ = self.dual_coef_ = np.array( 61 | get_coefficients(classes, self._n_support, self.support_, svms) 62 | ) 63 | 64 | if isinstance(self, PMMLBaseClassifier) and len(classes) == 2: 65 | self._n_support = (self._n_support / 2).astype(np.int32) 66 | 67 | linear = model.find('LinearKernelType') 68 | poly = model.find('PolynomialKernelType') 69 | rbf = model.find('RadialBasisKernelType') 70 | sigmoid = model.find('SigmoidKernelType') 71 | 72 | if linear is not None: 73 | self.kernel = 'linear' 74 | self._gamma = self.gamma = 0.0 75 | elif poly is not None: 76 | self.kernel = 'poly' 77 | self._gamma = self.gamma = float(poly.get('gamma')) 78 | self.coef0 = float(poly.get('coef0')) 79 | self.degree = int(poly.get('degree')) 80 | elif rbf is not None: 81 | 
self.kernel = 'rbf' 82 | self._gamma = self.gamma = float(rbf.get('gamma')) 83 | elif sigmoid is not None: 84 | self.kernel = 'sigmoid' 85 | self._gamma = self.gamma = float(sigmoid.get('gamma')) 86 | self.coef0 = float(sigmoid.get('coef0')) 87 | 88 | self._probA = np.array([]) 89 | self._probB = np.array([]) 90 | 91 | 92 | def get_vectors(vector_dictionary, s): 93 | """Return support vector values, parsed as a numpy array.""" 94 | instance = vector_dictionary.find(f"VectorInstance[@id='{s}']") 95 | 96 | if instance is None: 97 | raise Exception(f'PMML model is broken, vector instance (id = {s}) not found.') 98 | 99 | array = instance.find('Array') 100 | if array is None: 101 | array = instance.find('REAL-Array') 102 | if array is None: 103 | array = instance.find('SparseArray') 104 | if array is None: 105 | array = instance.find('REAL-SparseArray') 106 | if array is None: 107 | raise Exception(f'PMML model is broken, vector instance (id = {s}) does not contain (Sparse)Array element.') 108 | 109 | return np.array(parse_array(array)) 110 | 111 | 112 | def get_alt_svms(svms, classes, target_class): 113 | """ 114 | Find alternative SVMs (e.g., for target class 0, find the svms classifying 0 against 1, and 0 against 2). 115 | 116 | Parameters 117 | ---------- 118 | svms : list 119 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 120 | 121 | classes : numpy.array 122 | The classes to be predicted by the model. 123 | 124 | target_class : str 125 | The target class. 126 | 127 | Returns 128 | ------- 129 | alt_svms : list 130 | List of eTree.Elements filtered to only include SVMs comparing the target class against alternate classes. 
131 | 132 | """ 133 | # Noop for regression 134 | if classes[0] is None: 135 | return svms 136 | 137 | alt_svms = [ 138 | svm for svm in svms 139 | if svm.get('targetCategory') == str(target_class) or svm.get('alternateTargetCategory') == str(target_class) 140 | ] 141 | 142 | # Sort svms based on target class order 143 | alt_svms = [ 144 | next(svm for svm in alt_svms if svm.get('targetCategory') == str(c) or svm.get('alternateTargetCategory') == str(c)) 145 | for c in set(classes).difference({target_class}) 146 | ] 147 | 148 | return alt_svms 149 | 150 | 151 | def get_overlapping_vectors(svms): 152 | """ 153 | Return support vector ids that are present in all provided SVM elements. 154 | 155 | Parameters 156 | ---------- 157 | svms : list 158 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 159 | 160 | Returns 161 | ------- 162 | output : set 163 | Set containing all integer vector ids that are present in all provided SVM elements. 164 | 165 | """ 166 | support_vectors = [svm.find('SupportVectors') for svm in svms] 167 | vector_ids = [{int(x.get('vectorId')) for x in s.findall('SupportVector')} for s in support_vectors] 168 | return set.intersection(*vector_ids) 169 | 170 | 171 | def get_coefficients(classes, n_support, support_ids, svms): 172 | """ 173 | Return support vector coefficients. 174 | 175 | Parameters 176 | ---------- 177 | classes : numpy.array 178 | The classes to be predicted by the model. 179 | 180 | n_support : numpy.array 181 | Numpy array describing the number of support vectors for each class. 182 | 183 | support_ids: list 184 | A list describing the ids of all support vectors in the model. 185 | 186 | svms : list 187 | List of eTree.Element objects describing the different one-to-one support vector machines in the PMML. 
188 | 189 | """ 190 | dual_coef = np.zeros((len(classes) - 1, len(support_ids))) 191 | 192 | for i, x in enumerate(classes): 193 | alt_svms = get_alt_svms(svms, classes, x) 194 | offsets = [0] + np.cumsum(n_support).tolist() 195 | 196 | for j, svm in enumerate(alt_svms): 197 | start = offsets[i] 198 | end = offsets[i + 1] 199 | ids = support_ids[start:end] 200 | 201 | support_vectors = [int(x.get('vectorId')) for x in svm.find('SupportVectors').findall('SupportVector')] 202 | coefficients = [float(x.get('value')) for x in svm.find('Coefficients').findall('Coefficient')] 203 | indices = [support_vectors.index(x) for x in ids] 204 | dual_coef[j, start:end] = np.array(coefficients)[indices] 205 | 206 | return dual_coef 207 | -------------------------------------------------------------------------------- /sklearn_pmml_model/svm/_classes.py: -------------------------------------------------------------------------------- 1 | # License: BSD 2-Clause 2 | 3 | from sklearn.svm import LinearSVC, LinearSVR, NuSVC, NuSVR, SVC, SVR 4 | import numpy as np 5 | from scipy.sparse import isspmatrix 6 | from sklearn_pmml_model.base import OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseRegressor 7 | from sklearn_pmml_model.svm._base import PMMLBaseSVM 8 | from sklearn_pmml_model.linear_model.implementations import _get_coefficients as _linear_get_coefficients 9 | 10 | 11 | class PMMLLinearSVC(OneHotEncodingMixin, PMMLBaseClassifier, LinearSVC): 12 | """ 13 | Linear Support Vector Classification. 14 | 15 | Similar to SVC with parameter kernel='linear', but implemented in terms of 16 | liblinear rather than libsvm, so it has more flexibility in the choice of 17 | penalties and loss functions and should scale better to large numbers of 18 | samples. 19 | 20 | This class supports both dense and sparse input and the multiclass support 21 | is handled according to a one-vs-the-rest scheme. 22 | 23 | The PMML model is assumed to be equivalent to PMMLLogisticRegression. 
24 | 25 | Parameters 26 | ---------- 27 | pmml : str, object 28 | Filename or file object containing PMML data. 29 | 30 | Notes 31 | ----- 32 | Specification: http://dmg.org/pmml/v4-3/Regression.html 33 | 34 | """ 35 | 36 | def __init__(self, pmml): 37 | PMMLBaseClassifier.__init__(self, pmml) 38 | OneHotEncodingMixin.__init__(self) 39 | LinearSVC.__init__(self) 40 | 41 | # Import coefficients and intercepts 42 | model = self.root.find('RegressionModel') 43 | 44 | if model is None: 45 | raise Exception('PMML model does not contain RegressionModel.') 46 | 47 | tables = [ 48 | table for table in model.findall('RegressionTable') 49 | if table.find('NumericPredictor') is not None 50 | ] 51 | 52 | self.coef_ = [ 53 | _linear_get_coefficients(self, table) 54 | for table in tables 55 | ] 56 | self.intercept_ = [ 57 | float(table.get('intercept')) 58 | for table in tables 59 | ] 60 | 61 | if len(self.coef_) == 1: 62 | self.coef_ = [self.coef_[0]] 63 | 64 | if len(self.intercept_) == 1: 65 | self.intercept_ = [self.intercept_[0]] 66 | 67 | self.coef_ = np.array(self.coef_) 68 | self.intercept_ = np.array(self.intercept_) 69 | 70 | def fit(self, x, y): 71 | return PMMLBaseClassifier.fit(self, x, y) 72 | 73 | def _more_tags(self): 74 | return LinearSVC._more_tags(self) 75 | 76 | 77 | class PMMLLinearSVR(OneHotEncodingMixin, PMMLBaseRegressor, LinearSVR): 78 | """ 79 | Linear Support Vector Regression. 80 | 81 | Similar to SVR with parameter kernel='linear', but implemented in terms of 82 | liblinear rather than libsvm, so it has more flexibility in the choice of 83 | penalties and loss functions and should scale better to large numbers of 84 | samples. 85 | 86 | This class supports both dense and sparse input. 87 | 88 | The PMML model is assumed to be equivalent to PMMLLinearRegression. 89 | 90 | Parameters 91 | ---------- 92 | pmml : str, object 93 | Filename or file object containing PMML data. 
94 | 95 | Notes 96 | ----- 97 | Specification: http://dmg.org/pmml/v4-3/Regression.html 98 | 99 | """ 100 | 101 | def __init__(self, pmml): 102 | PMMLBaseRegressor.__init__(self, pmml) 103 | OneHotEncodingMixin.__init__(self) 104 | 105 | # Import coefficients and intercepts 106 | model = self.root.find('RegressionModel') 107 | 108 | if model is None: 109 | raise Exception('PMML model does not contain RegressionModel.') 110 | 111 | tables = model.findall('RegressionTable') 112 | 113 | self.coef_ = np.array([ 114 | _linear_get_coefficients(self, table) 115 | for table in tables 116 | ]) 117 | self.intercept_ = np.array([ 118 | float(table.get('intercept')) 119 | for table in tables 120 | ]) 121 | 122 | if self.coef_.shape[0] == 1: 123 | self.coef_ = self.coef_[0] 124 | 125 | if self.intercept_.shape[0] == 1: 126 | self.intercept_ = self.intercept_[0] 127 | 128 | def fit(self, x, y): 129 | return PMMLBaseRegressor.fit(self, x, y) 130 | 131 | def _more_tags(self): 132 | return LinearSVR._more_tags(self) 133 | 134 | 135 | class PMMLNuSVC(OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseSVM, NuSVC): 136 | """ 137 | Nu-Support Vector Classification. 138 | 139 | Similar to SVC but uses a parameter to control the number of support 140 | vectors. 141 | 142 | The implementation is based on libsvm. 143 | 144 | Parameters 145 | ---------- 146 | pmml : str, object 147 | Filename or file object containing PMML data. 
148 | 149 | Notes 150 | ----- 151 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 152 | 153 | """ 154 | 155 | def __init__(self, pmml): 156 | PMMLBaseClassifier.__init__(self, pmml) 157 | OneHotEncodingMixin.__init__(self) 158 | NuSVC.__init__(self) 159 | PMMLBaseSVM.__init__(self) 160 | 161 | def _prepare_data(self, X): 162 | self._sparse = isspmatrix(X) 163 | return super()._prepare_data(X) 164 | 165 | def decision_function(self, X, *args, **kwargs): 166 | X = self._prepare_data(X) 167 | return super().decision_function(X, *args, **kwargs) 168 | 169 | def fit(self, x, y): 170 | return PMMLBaseClassifier.fit(self, x, y) 171 | 172 | def _more_tags(self): 173 | return NuSVC._more_tags(self) 174 | 175 | 176 | class PMMLNuSVR(OneHotEncodingMixin, PMMLBaseRegressor, PMMLBaseSVM, NuSVR): 177 | """ 178 | Nu Support Vector Regression. 179 | 180 | Similar to NuSVC, for regression, uses a parameter nu to control 181 | the number of support vectors. However, unlike NuSVC, where nu 182 | replaces C, here nu replaces the parameter epsilon of epsilon-SVR. 183 | 184 | The implementation is based on libsvm. 185 | 186 | Parameters 187 | ---------- 188 | pmml : str, object 189 | Filename or file object containing PMML data. 190 | 191 | Notes 192 | ----- 193 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 194 | 195 | """ 196 | 197 | def __init__(self, pmml): 198 | PMMLBaseRegressor.__init__(self, pmml) 199 | OneHotEncodingMixin.__init__(self) 200 | NuSVR.__init__(self) 201 | PMMLBaseSVM.__init__(self) 202 | 203 | def _prepare_data(self, X): 204 | self._sparse = isspmatrix(X) 205 | return super()._prepare_data(X) 206 | 207 | def fit(self, x, y): 208 | return PMMLBaseRegressor.fit(self, x, y) 209 | 210 | def _more_tags(self): 211 | return NuSVR._more_tags(self) 212 | 213 | 214 | class PMMLSVC(OneHotEncodingMixin, PMMLBaseClassifier, PMMLBaseSVM, SVC): 215 | """ 216 | C-Support Vector Classification. 
217 | 218 | The implementation is based on libsvm. The multiclass support is 219 | handled according to a one-vs-one scheme. 220 | 221 | For details on the precise mathematical formulation of the provided 222 | kernel functions and how `gamma`, `coef0` and `degree` affect each 223 | other, see the corresponding section in the narrative documentation: 224 | `Kernel functions `_. 225 | 226 | Parameters 227 | ---------- 228 | pmml : str, object 229 | Filename or file object containing PMML data. 230 | 231 | Notes 232 | ----- 233 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 234 | 235 | """ 236 | 237 | def __init__(self, pmml): 238 | PMMLBaseClassifier.__init__(self, pmml) 239 | OneHotEncodingMixin.__init__(self) 240 | SVC.__init__(self) 241 | PMMLBaseSVM.__init__(self) 242 | 243 | def _prepare_data(self, X): 244 | self._sparse = isspmatrix(X) 245 | return super()._prepare_data(X) 246 | 247 | def decision_function(self, X, *args, **kwargs): 248 | X = self._prepare_data(X) 249 | return super().decision_function(X, *args, **kwargs) 250 | 251 | def fit(self, x, y): 252 | return PMMLBaseClassifier.fit(self, x, y) 253 | 254 | def _more_tags(self): 255 | return SVC._more_tags(self) 256 | 257 | 258 | class PMMLSVR(OneHotEncodingMixin, PMMLBaseRegressor, PMMLBaseSVM, SVR): 259 | """ 260 | Epsilon-Support Vector Regression. 261 | 262 | The free parameters in the model are C and epsilon. The implementation 263 | is based on libsvm. 264 | 265 | For details on the precise mathematical formulation of the provided 266 | kernel functions and how `gamma`, `coef0` and `degree` affect each 267 | other, see the corresponding section in the narrative documentation: 268 | `Kernel functions `_. 269 | 270 | Parameters 271 | ---------- 272 | pmml : str, object 273 | Filename or file object containing PMML data. 
274 | 275 | Notes 276 | ----- 277 | Specification: http://dmg.org/pmml/v4-3/SupportVectorMachine.html 278 | 279 | """ 280 | 281 | def __init__(self, pmml): 282 | PMMLBaseRegressor.__init__(self, pmml) 283 | OneHotEncodingMixin.__init__(self) 284 | SVR.__init__(self) 285 | PMMLBaseSVM.__init__(self) 286 | 287 | def _prepare_data(self, X): 288 | self._sparse = isspmatrix(X) 289 | return super()._prepare_data(X) 290 | 291 | def fit(self, x, y): 292 | return PMMLBaseRegressor.fit(self, x, y) 293 | 294 | def _more_tags(self): 295 | return SVR._more_tags(self) 296 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/README.md: -------------------------------------------------------------------------------- 1 | # sklearn-pmml-model.tree 2 | 3 | This package contains the `PMMLTreeClassifier`. 4 | 5 | ## Example 6 | A minimal working example is shown below: 7 | 8 | ```python 9 | from sklearn.datasets import load_iris 10 | from sklearn.model_selection import train_test_split 11 | import pandas as pd 12 | import numpy as np 13 | from sklearn_pmml_model.tree import PMMLTreeClassifier 14 | 15 | # Prepare data 16 | iris = load_iris() 17 | X = pd.DataFrame(iris.data) 18 | X.columns = np.array(iris.feature_names) 19 | y = pd.Series(np.array(iris.target_names)[iris.target]) 20 | y.name = "Class" 21 | Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33, random_state=123) 22 | 23 | clf = PMMLTreeClassifier(pmml="models/tree-iris.pmml") 24 | clf.predict(Xte) 25 | clf.score(Xte, yte) 26 | ``` 27 | 28 | To interpret the resulting tree, including categorical spits, we adapted the example from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html: 29 | 30 | ```python 31 | node_indicator = clf.decision_path(X) 32 | leaf_id = clf.apply(X) 33 | 34 | sample_id = 0 35 | # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id` 36 | node_index = 
node_indicator.indices[node_indicator.indptr[sample_id]: 37 | node_indicator.indptr[sample_id + 1]] 38 | 39 | print('Rules used to predict sample {id}:\n'.format(id=sample_id)) 40 | for node_id in node_index: 41 | # continue to the next node if it is a leaf node 42 | if leaf_id[sample_id] == node_id: 43 | continue 44 | 45 | # check if value of the split feature for sample 0 is below threshold 46 | if isinstance(clf.tree_.threshold[node_id], list): 47 | threshold_sign = "in" 48 | elif (X.iloc[sample_id, clf.tree_.feature[node_id]] <= clf.tree_.threshold[node_id]): 49 | threshold_sign = "<=" 50 | else: 51 | threshold_sign = ">" 52 | 53 | print("decision node {node} : (X[{sample}, {feature}] = {value}) " 54 | "{inequality} {threshold})".format( 55 | node=node_id, 56 | sample=sample_id, 57 | feature=clf.tree_.feature[node_id], 58 | value=X.iloc[sample_id, clf.tree_.feature[node_id]], 59 | inequality=threshold_sign, 60 | threshold=str(clf.tree_.threshold[node_id]))) 61 | ``` -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`sklearn_pmml_model.tree` module includes decision tree-based models for 3 | classification and regression. 4 | """ 5 | 6 | # License: BSD 2-Clause 7 | 8 | from .tree import PMMLTreeClassifier, PMMLTreeRegressor, get_tree, clone 9 | 10 | __all__ = ['PMMLTreeClassifier', 'PMMLTreeRegressor', 'get_tree', 'clone'] 11 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_criterion.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | # Authors: Gilles Louppe 4 | # Peter Prettenhofer 5 | # Brian Holt 6 | # Joel Nothman 7 | # Arnaud Joly 8 | # Jacob Schreiber 9 | # 10 | # License: BSD 3 clause 11 | 12 | # See _criterion.pyx for implementation details. 
13 | 14 | import numpy as np 15 | cimport numpy as np 16 | 17 | ctypedef np.npy_float32 DTYPE_t # Type of X 18 | ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight 19 | ctypedef np.npy_intp SIZE_t # Type for indices and counters 20 | ctypedef np.npy_int32 INT32_t # Signed 32 bit integer 21 | ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer 22 | 23 | cdef class Criterion: 24 | # The criterion computes the impurity of a node and the reduction of 25 | # impurity of a split on that node. It also computes the output statistics 26 | # such as the mean in regression and class probabilities in classification. 27 | 28 | # Internal structures 29 | cdef DOUBLE_t* y # Values of y 30 | cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1) 31 | cdef DOUBLE_t* sample_weight # Sample weights 32 | 33 | cdef SIZE_t* samples # Sample indices in X, y 34 | cdef SIZE_t start # samples[start:pos] are the samples in the left node 35 | cdef SIZE_t pos # samples[pos:end] are the samples in the right node 36 | cdef SIZE_t end 37 | 38 | cdef SIZE_t n_outputs # Number of outputs 39 | cdef SIZE_t n_samples # Number of samples 40 | cdef SIZE_t n_node_samples # Number of samples in the node (end-start) 41 | cdef double weighted_n_samples # Weighted number of samples (in total) 42 | cdef double weighted_n_node_samples # Weighted number of samples in the node 43 | cdef double weighted_n_left # Weighted number of samples in the left node 44 | cdef double weighted_n_right # Weighted number of samples in the right node 45 | 46 | cdef double* sum_total # For classification criteria, the sum of the 47 | # weighted count of each label. For regression, 48 | # the sum of w*y. sum_total[k] is equal to 49 | # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], 50 | # where k is output index. 
51 | cdef double* sum_left # Same as above, but for the left side of the split 52 | cdef double* sum_right # same as above, but for the right side of the split 53 | 54 | # The criterion object is maintained such that left and right collected 55 | # statistics correspond to samples[start:pos] and samples[pos:end]. 56 | 57 | # Methods 58 | cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, 59 | double weighted_n_samples, SIZE_t* samples, SIZE_t start, 60 | SIZE_t end) except -1 nogil 61 | cdef int reset(self) except -1 nogil 62 | cdef int reverse_reset(self) except -1 nogil 63 | cdef int update(self, SIZE_t new_pos) except -1 nogil 64 | cdef double node_impurity(self) nogil 65 | cdef void children_impurity(self, double* impurity_left, 66 | double* impurity_right) nogil 67 | cdef void node_value(self, double* dest) nogil 68 | cdef double impurity_improvement(self, double impurity) nogil 69 | cdef double proxy_impurity_improvement(self) nogil 70 | 71 | cdef class ClassificationCriterion(Criterion): 72 | """Abstract criterion for classification.""" 73 | 74 | cdef SIZE_t* n_classes 75 | cdef SIZE_t sum_stride 76 | 77 | cdef class RegressionCriterion(Criterion): 78 | """Abstract regression criterion.""" 79 | 80 | cdef double sq_sum_total 81 | -------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_splitter.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | # Authors: Gilles Louppe 4 | # Peter Prettenhofer 5 | # Brian Holt 6 | # Joel Nothman 7 | # Arnaud Joly 8 | # Jacob Schreiber 9 | # 10 | # License: BSD 3 clause 11 | 12 | # See _splitter.pyx for details. 
# Declarations for the node splitter; the implementation lives in _splitter.pyx.

import numpy as np
cimport numpy as np

from ._utils cimport SplitValue, SplitRecord

from ._criterion cimport Criterion

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
ctypedef np.npy_uint64 UINT64_t          # Unsigned 64 bit integer

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.

    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf

    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t* features                # Feature indices in X
    cdef SIZE_t* constant_features       # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t* feature_values         # temp. array holding feature values

    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node

    cdef bint presort                    # Whether to use presorting, only
                                         # allowed on dense data
    cdef bint breiman_shortcut           # Whether decision trees are allowed to use the
                                         # Breiman shortcut for categorical features

    cdef DOUBLE_t* y
    cdef SIZE_t y_stride
    cdef DOUBLE_t* sample_weight
    cdef INT32_t *n_categories           # (n_features,) array giving number of
                                         # categories (<0 for non-categorical)
    cdef UINT32_t* cat_cache             # Cache buffer for fast categorical split evaluation

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.

    # The 1-d `features` array of size n_features contains the features
    # indices and allows fast sampling without replacement of features.

    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons
    # This allows optimization with depth-based tree building.

    # Methods
    cdef int init(self, object X, np.ndarray y,
                  DOUBLE_t* sample_weight,
                  INT32_t* n_categories,
                  np.ndarray X_idx_sorted=*) except -1

    cdef int node_reset(self, SIZE_t start, SIZE_t end,
                        double* weighted_n_node_samples) except -1 nogil

    cdef int node_split(self,
                        double impurity,   # Impurity of the node
                        SplitRecord* split,
                        SIZE_t* n_constant_features) except -1 nogil

    cdef void node_value(self, double* dest) nogil

    cdef double node_impurity(self) nogil
-------------------------------------------------------------------------------- /sklearn_pmml_model/tree/_tree.pxd: --------------------------------------------------------------------------------
# cython: language_level=3

# Authors: Gilles Louppe
#          Peter Prettenhofer
#          Brian Holt
#          Joel Nothman
#          Arnaud Joly
#          Jacob Schreiber
#          Nelson Liu
#
# License: BSD 3 clause

# See _tree.pyx for details.

import numpy as np
cimport numpy as np

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer

from ._utils cimport SplitValue
from ._utils cimport SplitRecord
from ._utils cimport Node
from ._splitter cimport Splitter


cdef class CategoryCacheMgr:
    # Class to manage the category cache memory during Tree.apply()

    cdef SIZE_t n_nodes
    cdef UINT32_t **bits

    cdef void populate(self, Node *nodes, SIZE_t n_nodes, INT32_t *n_categories)


cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.
    # Input/Output layout
    cdef public SIZE_t n_features        # Number of features in X
    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
    cdef public SIZE_t n_outputs         # Number of outputs in y
    cdef public SIZE_t max_n_classes     # max(n_classes)

    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t node_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
    cdef SIZE_t value_stride             # = n_outputs * max_n_classes
    cdef INT32_t *n_categories           # (n_features,) array giving number of
                                         # categories (<0 for non-categorical)

    # Methods
    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, SplitValue split_value, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_samples) except -1 nogil
    cdef int _resize(self, SIZE_t capacity) except -1 nogil
    cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil

    cdef np.ndarray _get_value_ndarray(self)
    cdef np.ndarray _get_node_ndarray(self)

    cpdef np.ndarray predict(self, object X)

    cpdef np.ndarray apply(self, object X)
    cdef np.ndarray _apply_dense(self, object X)
    cdef np.ndarray _apply_sparse_csr(self, object X)

    cpdef object decision_path(self, object X)
    cdef object _decision_path_dense(self, object X)
    cdef object _decision_path_sparse_csr(self, object X)

    cpdef compute_feature_importances(self, normalize=*)


# =============================================================================
# Tree builder
# =============================================================================

cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.

    cdef Splitter splitter               # Splitting algorithm

    cdef SIZE_t min_samples_split        # Minimum number of samples in an internal node
    cdef SIZE_t min_samples_leaf         # Minimum number of samples in a leaf
    cdef double min_weight_leaf          # Minimum weight in a leaf
    cdef SIZE_t max_depth                # Maximal tree depth
    cdef double min_impurity_split
    cdef double min_impurity_decrease    # Impurity threshold for early stopping

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=*,
                np.ndarray n_categories=*,
                np.ndarray X_idx_sorted=*)
    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
-------------------------------------------------------------------------------- /sklearn_pmml_model/tree/quad_tree.pxd: --------------------------------------------------------------------------------
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# Author: Thomas Moreau <thomas.moreau.2010@gmail.com>
# Author: Olivier Grisel <olivier.grisel@ensta.fr>

# See quad_tree.pyx for details.

import numpy as np
cimport numpy as np

# C-level typedefs aliasing the NumPy dtypes used throughout the tree code.
ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer

# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
    DEBUGFLAG = 0

cdef float EPSILON = 1e-6

# XXX: Careful to not change the order of the arguments.
# It is important to
# have is_leaf and max_width consecutive as it permits to avoid padding by
# the compiler and keep the size coherent for both C and numpy data structures.
cdef struct Cell:
    # Base storage structure for cells in a QuadTree object

    # Tree structure
    SIZE_t parent                        # Parent cell of this cell
    SIZE_t[8] children                   # Array pointing to childrens of this cell

    # Cell description
    SIZE_t cell_id                       # Id of the cell in the cells array in the Tree
    SIZE_t point_index                   # Index of the point at this cell (only defined
                                         # in non empty leaf)
    bint is_leaf                         # Does this cell have children?
    DTYPE_t squared_max_width            # Squared value of the maximum width w
    SIZE_t depth                         # Depth of the cell in the tree
    SIZE_t cumulative_size               # Number of points included in the subtree with
                                         # this cell as a root.

    # Internal constants
    DTYPE_t[3] center                    # Store the center for quick split of cells
    DTYPE_t[3] barycenter                # Keep track of the center of mass of the cell

    # Cell boundaries
    DTYPE_t[3] min_bounds                # Inferior boundaries of this cell (inclusive)
    DTYPE_t[3] max_bounds                # Superior boundaries of this cell (exclusive)


cdef class _QuadTree:
    # The QuadTree object is a quad tree structure constructed by inserting
    # recursively points in the tree and splitting cells in 4 so that each
    # leaf cell contains at most one point.
    # This structure also handle 3D data, inserted in trees with 8 children
    # for each node.

    # Parameters of the tree
    cdef public int n_dimensions         # Number of dimensions in X
    cdef public int verbose              # Verbosity of the output
    cdef SIZE_t n_cells_per_cell         # Number of children per node. (2 ** n_dimension)

    # Tree inner structure
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t cell_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef public SIZE_t n_points          # Total number of points
    cdef Cell* cells                     # Array of nodes

    # Point insertion methods
    cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index,
                          SIZE_t cell_id=*) except -1 nogil
    cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
                                           SIZE_t point_index, SIZE_t size=*
                                           ) nogil
    cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil
    cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil

    # Create a summary of the Tree compare to a query point
    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
                        float squared_theta=*, SIZE_t cell_id=*, long idx=*
                        ) noexcept nogil

    # Internal cell initialization methods
    cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
    cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds
                         ) nogil

    # Private methods
    cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell
                                  ) except -1 nogil

    # Private array manipulation to manage the ``cells`` array
    cdef int _resize(self, SIZE_t capacity) except -1 nogil
    cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil
    cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) except -1 nogil
    cdef np.ndarray _get_cell_ndarray(self)
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamDecode/sklearn-pmml-model/13d992034c29847aa6ed3e377c0eaa5b3366d7cd/tests/__init__.py
-------------------------------------------------------------------------------- /tests/naive_bayes/test_naive_bayes.py: --------------------------------------------------------------------------------
"""Tests for the PMML Gaussian Naive Bayes wrapper (PMMLGaussianNB)."""
from unittest import TestCase
import sklearn_pmml_model
from sklearn_pmml_model.naive_bayes import PMMLGaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
from os import path, remove
from io import StringIO
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


# Directory of the installed sklearn_pmml_model package; the PMML fixtures
# referenced below live in the sibling ../models directory.
BASE_DIR = path.dirname(sklearn_pmml_model.__file__)


class TestNaiveBayes(TestCase):
  # Constructor-time validation of PMMLGaussianNB.

  def test_invalid_model(self):
    """A PMML document without a NaiveBayesModel element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLGaussianNB(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain NaiveBayesModel.'

  def test_unsupported_distribution(self):
    """A NaiveBayesModel using a distribution scikit-learn cannot map must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLGaussianNB(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'Distribution "PoissonDistribution" not implemented, or not supported by scikit-learn'

  def test_more_tags(self):
    """The wrapper advertises the same estimator tags as sklearn's GaussianNB."""
    clf = PMMLGaussianNB(path.join(BASE_DIR, '../models/nb-cat-pima.pmml'))
    assert clf._more_tags() == GaussianNB()._more_tags()


class TestGaussianNBIntegration(TestCase):
  # End-to-end checks of a PMML model against a natively-fitted GaussianNB.

  def setUp(self):
    """Load the categorical test set and both the PMML and reference models."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    Xte = df.iloc[:, 1:]
    # One-hot encode the categorical features for the reference estimator.
    Xenc = pd.get_dummies(Xte, prefix_sep='')
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)
    self.enc = (Xenc, yte)

    pmml = path.join(BASE_DIR, '../models/nb-cat-pima.pmml')
    self.clf = PMMLGaussianNB(pmml)

    self.ref = GaussianNB()
    self.ref.fit(Xenc, yte)

  def test_predict_proba(self):
    """Class probabilities match the precomputed reference values."""
    Xte, _ = self.test
    ref = np.array([0.089665518, 0.229009345, 0.007881006, 0.025306284, 0.013287187, 0.085741556, 0.338780868, 0.063463670, 0.769219497, 0.100369704, 0.002308186, 0.050380836, 0.054716302, 0.114718523, 0.156496072, 0.076301905, 0.806474996, 0.001227284, 0.121921194, 0.146751623, 0.074212037, 0.084148702, 0.479980587, 0.234470483, 0.354876655, 0.480582547, 0.113901660, 0.969566830, 0.989918477, 0.760519487, 0.599039599, 0.997856475, 0.776102648, 0.863233887, 0.910001902, 0.846005607, 0.734269347, 0.841546008, 0.120615475, 0.457027577, 0.124201960, 0.882691224, 0.930458760, 0.585210046, 0.484105369, 0.697949034, 0.778448666, 0.820806942, 0.074380668, 0.978478762, 0.589284915, 0.586728917])
    assert np.allclose(ref, self.clf.predict_proba(Xte)[:, 0])

  def test_predict(self):
    """Hard predictions match the precomputed reference labels."""
    Xte, _ = self.test
    ref = np.array(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No'])
    assert all(ref == self.clf.predict(Xte))

  def test_score(self):
    """Accuracy on the test set matches the precomputed reference score."""
    Xte, yte = self.test
    ref = 0.8461538462
    assert np.allclose(ref, self.clf.score(Xte, yte))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'

  def test_sklearn2pmml(self):
    """Round trip: export the reference model via sklearn2pmml, re-import, compare."""
    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", self.ref)
    ])
    pipeline.fit(self.enc[0], self.enc[1])
    sklearn2pmml(pipeline, "gnb-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLGaussianNB(pmml='gnb-sklearn2pmml.pmml')

      # Verify classification
      Xenc, _ = self.enc
      assert np.allclose(
        self.ref.predict_proba(Xenc),
        model.predict_proba(Xenc)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("gnb-sklearn2pmml.pmml")


class TestGaussianNBWineIntegration(TestCase):
  # Round-trip check on the purely numeric wine dataset.

  def setUp(self):
    """Fit a reference GaussianNB on the wine dataset."""
    df = load_wine(as_frame=True)
    Xte = df.data
    yte = df.target
    self.test = (Xte, yte)

    self.ref = GaussianNB()
    self.ref.fit(Xte, yte)

  def test_sklearn2pmml(self):
    """Round trip: export via sklearn2pmml, re-import, compare probabilities."""
    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", self.ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "gnb-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLGaussianNB(pmml='gnb-sklearn2pmml.pmml')

      # Verify classification
      Xte, _ = self.test
      assert np.allclose(
        self.ref.predict_proba(Xte),
        model.predict_proba(Xte)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("gnb-sklearn2pmml.pmml")
-------------------------------------------------------------------------------- /tests/neighbors/test_knn.py: --------------------------------------------------------------------------------
"""Tests for the PMML k-nearest-neighbors wrappers."""
from unittest import TestCase
import sklearn_pmml_model
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn_pmml_model.neighbors import PMMLKNeighborsClassifier, PMMLKNeighborsRegressor
import pandas as pd
import numpy as np
from os import path, remove
from io import StringIO
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


# Directory of the installed sklearn_pmml_model package; the PMML fixtures
# referenced below live in the sibling ../models directory.
BASE_DIR = path.dirname(sklearn_pmml_model.__file__)


class TestKNearestNeighbors(TestCase):
  # Constructor-time validation of PMMLKNeighborsClassifier.

  def test_invalid_model(self):
    """A PMML document without a NearestNeighborModel element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain NearestNeighborModel.'

  def test_no_distance_metric(self):
    """A NearestNeighborModel without a ComparisonMeasure element must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model does not contain ComparisonMeasure.'

  def test_unsupported_distance_metric(self):
    """A ComparisonMeasure with an unknown metric name must raise."""
    with self.assertRaises(Exception) as cm:
      # NOTE(review): the inline PMML XML payload was stripped from this dump
      # — restore the original document from upstream before running.
      PMMLKNeighborsClassifier(pmml=StringIO("""
      """))

    assert str(cm.exception) == 'PMML model uses unsupported distance metric: "funkydistance".'
class TestKNeighborsClassifierIntegration(TestCase):
  # End-to-end checks of a PMML k-NN classifier against on-disk fixtures.

  def setUp(self):
    """Load the categorical test set (ordinal-encoding 'age') and the PMML model."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    cats = np.unique(df['age'])
    # Encode the categorical 'age' column as 1-based ordinal codes, matching
    # the encoding used when the PMML fixture was produced.
    df['age'] = pd.Categorical(df['age'], categories=cats).codes + 1
    Xte = df.iloc[:, 1:]
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)

    pmml = path.join(BASE_DIR, '../models/knn-clf-pima.pmml')
    self.clf = PMMLKNeighborsClassifier(pmml)

  def test_predict(self):
    """Hard predictions match the precomputed reference labels."""
    Xte, yte = self.test
    ref = np.array(['Yes','No','Yes','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','Yes','No','Yes','Yes','Yes','Yes','No','No','No','Yes','No','No','No','No','No','No','No','Yes','No','No','No','No','No','No','Yes','No','No','No','No','Yes','Yes'])
    assert np.array_equal(ref, np.array(self.clf.predict(Xte)))

  def test_score(self):
    """Accuracy on the test set matches the precomputed reference score."""
    Xte, yte = self.test
    ref = 0.807692307692307
    assert np.allclose(ref, self.clf.score(Xte, yte))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'

  def test_more_tags(self):
    """Tags match sklearn's KNeighborsClassifier, plus the requires_y marker."""
    assert self.clf._more_tags() == {'requires_y': True, **KNeighborsClassifier()._more_tags()}

  def test_sklearn2pmml(self):
    """Round trip: export a fitted classifier via sklearn2pmml, re-import, compare."""
    X, y = self.test
    ref = KNeighborsClassifier(n_neighbors=11)
    ref.fit(X, y)

    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "knn-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLKNeighborsClassifier(pmml='knn-sklearn2pmml.pmml')

      assert np.allclose(
        ref.predict_proba(X),
        model.predict_proba(X)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("knn-sklearn2pmml.pmml")

  def test_sklearn2pmml_manhattan(self):
    """Same round trip as above, with a non-default (manhattan) metric."""
    X, y = self.test
    ref = KNeighborsClassifier(metric='manhattan', n_neighbors=8)
    ref.fit(X, y)

    # Export to PMML
    pipeline = PMMLPipeline([
      ("classifier", ref)
    ])
    pipeline.fit(self.test[0], self.test[1])
    sklearn2pmml(pipeline, "knn-sklearn2pmml.pmml", with_repr = True)

    try:
      # Import PMML
      model = PMMLKNeighborsClassifier(pmml='knn-sklearn2pmml.pmml')

      assert np.allclose(
        ref.predict_proba(X),
        model.predict_proba(X)
      )

    finally:
      # Always clean up the exported file, even on assertion failure.
      remove("knn-sklearn2pmml.pmml")


class TestKNeighborsRegressorIntegration(TestCase):
  # End-to-end checks of a PMML k-NN regressor against on-disk fixtures.

  def setUp(self):
    """Load the categorical test set (ordinal-encoding 'age') and the PMML model."""
    df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
    cats = np.unique(df['age'])
    # Encode the categorical 'age' column as 1-based ordinal codes, matching
    # the encoding used when the PMML fixture was produced.
    df['age'] = pd.Categorical(df['age'], categories=cats).codes + 1
    Xte = df.iloc[:, 1:]
    yte = df.iloc[:, 0]
    self.test = (Xte, yte)

    pmml = path.join(BASE_DIR, '../models/knn-reg-pima.pmml')
    self.clf = PMMLKNeighborsRegressor(pmml)

  def test_predict(self):
    """Regression outputs match the precomputed reference values."""
    Xte, yte = self.test
    ref = np.array([
      0.7142857,
      0.1428571,
      0.7142857,
      1.0000000,
      0.8571429,
      0.8571429,
      0.7142857,
      0.8571429,
      0.2857143,
      0.8571429,
      0.8571429,
      0.7142857,
      0.8571429,
      1.0000000,
      0.8571429,
      0.7142857,
      0.1428571,
      1.0000000,
      0.5714286,
      0.7142857,
      0.8571429,
      0.5714286,
      0.4285714,
      0.5714286,
      1.0000000,
      0.5714286,
      0.8571429,
      0.1428571,
      0.1428571,
      0.2857143,
      0.7142857,
      0.1428571,
      0.2857143,
      0.0000000,
      0.1428571,
      0.2857143,
      0.1428571,
      0.0000000,
      0.8571429,
      0.4285714,
      0.2857143,
      0.1428571,
      0.1428571,
      0.1428571,
      0.1428571,
      0.7142857,
      0.0000000,
      0.1428571,
      0.2857143,
      0.1428571,
      0.7142857,
      0.7142857,
    ])
    assert np.allclose(ref, np.array(self.clf.predict(Xte)))

  def test_score(self):
    """R^2 score against the binarized target matches the reference value."""
    Xte, yte = self.test
    ref = 0.383045525902668
    assert np.allclose(ref, self.clf.score(Xte, (yte == 'Yes').astype(int)))

  def test_fit_exception(self):
    """PMML-backed estimators are read-only; fit() must raise."""
    with self.assertRaises(Exception) as cm:
      self.clf.fit(np.array([[]]), np.array([]))

    assert str(cm.exception) == 'Not supported.'
  def test_more_tags(self):
    """Tags match sklearn's KNeighborsRegressor."""
    assert self.clf._more_tags() == KNeighborsRegressor()._more_tags()
-------------------------------------------------------------------------------- /tests/test_datatypes.py: --------------------------------------------------------------------------------
"""Tests for the Interval and Category PMML data types."""
from unittest import TestCase
from sklearn_pmml_model.datatypes import Category, Interval


class TestInterval(TestCase):
  # Validation and membership semantics of Interval.

  def test_exception(self):
    """Invalid constructor arguments must fail the internal assertions."""
    # Missing both endpoints.
    with self.assertRaises(Exception) as cm:
      Interval(closure='openOpen')
    assert type(cm.exception) == AssertionError

    # Lower bound greater than upper bound.
    with self.assertRaises(Exception) as cm:
      Interval('openOpen', 3, 0)
    assert type(cm.exception) == AssertionError

    # Unknown closure keyword.
    with self.assertRaises(Exception) as cm:
      Interval('non_existing_closure', 0)
    assert type(cm.exception) == AssertionError

  def test_contains(self):
    """`in` respects the closed-closed boundaries."""
    interval = Interval('closedClosed', 1, 10)

    assert 2 in interval
    assert 0 not in interval
    assert 10.1 not in interval


class TestCategory(TestCase):
  # Validation, membership, and conversion semantics of Category.

  def test_exception(self):
    """Invalid constructor arguments must fail the internal assertions."""
    # categories must be a list, not a string.
    with self.assertRaises(Exception) as cm:
      Category(str, categories="bad cats")
    assert type(cm.exception) == AssertionError

    # ordered must be a boolean, not an int.
    with self.assertRaises(Exception) as cm:
      Category(str, [1, 2], ordered=1)
    assert type(cm.exception) == AssertionError

  def test_contains(self):
    """`in` tests membership of the declared categories."""
    categories = ['loud', 'louder', 'loudest']
    cat_type = Category(str, categories, ordered=True)

    assert 'loud' in cat_type
    assert 'bad' not in cat_type

  def test_callable(self):
    """Calling the type coerces valid values and raises for unknown ones."""
    categories = ['1', '2', '3']
    cat_type = Category(int, categories, ordered=True)

    with self.assertRaises(Exception) as cm:
      cat_type('4')

    assert str(cm.exception) == 'Invalid categorical value: 4'
    # Valid values are converted to the declared base type (int).
    assert isinstance(cat_type('1'), int)
    assert cat_type('2') == 2
--------------------------------------------------------------------------------