├── .coveragerc ├── .coveralls.yml ├── .editorconfig ├── .gitattributes ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── cmd2pkg ├── datasets ├── libsvm │ ├── README.txt │ ├── breast-cancer_scale │ ├── german.numer_scale │ ├── heart_scale │ ├── ionosphere.arff │ └── ionosphere_scale └── simple_comparison_JKMs_Weka.py ├── demo_tutorials └── demo_kernelmethods.ipynb ├── docs ├── API.rst ├── Makefile ├── categorical_kernels.rst ├── conf.py ├── contributing.rst ├── flyer.png ├── graph_kernels.rst ├── history.rst ├── index.rst ├── installation.rst ├── kernel_functions.rst ├── kernel_matrix.rst ├── km_collections.rst ├── logo_kernelmethods.png ├── make.bat ├── numeric_kernels.rst ├── operations.rst ├── readme.rst ├── string_kernels.rst ├── usage.rst └── utilities.rst ├── kernelmethods ├── __init__.py ├── _version.py ├── algorithms.py ├── base.py ├── categorical.py ├── config.py ├── numeric_kernels.py ├── operations.py ├── ranking.py ├── sampling.py ├── tests │ ├── test_algorithms.py │ ├── test_base_classes.py │ ├── test_categorical.py │ ├── test_kernel_matrix.py │ ├── test_kernel_set.py │ ├── test_numeric_kernels.py │ ├── test_operations.py │ ├── test_ranking.py │ ├── test_sampling.py │ └── test_utils.py └── utils.py ├── pytest.ini ├── requirements.txt ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tox.ini └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | def __repr__ 5 | def __str__ 6 | def __format__ 7 | __format__ 8 | __repr__ 9 | __str__ 10 | if contains_nan_inf* 11 | if self.debug: 12 | if settings.DEBUG 13 | raise RuntimeError 14 | raise AssertionError 15 | raise NotImplementedError 16 | if 0: 17 | if __name__ == .__main__.: 18 | 19 | ignore_errors = True 20 | 21 | [run] 22 | omit = 23 | # omit anything 
in a .local directory anywhere 24 | */.local/* 25 | 26 | # omit everything in /usr 27 | /usr/* 28 | */tests/*.py 29 | 30 | # irrelevant files 31 | kernelmethods/__*__.py 32 | kernelmethods/_version.py 33 | 34 | -------------------------------------------------------------------------------- /.coveralls.yml: -------------------------------------------------------------------------------- 1 | service_name: travis-pro 2 | repo_token: mnWg3PvTHwoOPt7HFxzVqM5gFXwI095KB 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | kernelmethods/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * kernelmethods version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | 2 | # Config file for automatic testing at travis-ci.org 3 | install: 4 | - pip install pytest-cov hypothesis 5 | - pip install -r requirements_dev.txt 6 | - pip install -e . 7 | 8 | language: python 9 | cache: pip 10 | python: 11 | - 3.6 12 | 13 | script: 14 | - pytest --cov kernelmethods --cov-config=.coveragerc 15 | 16 | after_success: 17 | - coveralls 18 | 19 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at raamana@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/raamana/kernelmethods/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 
23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | kernelmethods could always use more documentation, whether as part of the 42 | official kernelmethods docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/raamana/kernelmethods/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `kernelmethods` for local development. 61 | 62 | 1. Fork the `kernelmethods` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/kernelmethods.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv kernelmethods 70 | $ cd kernelmethods/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. 
When you're done making changes, check that your changes pass flake8 and the 80 | tests, including testing other Python versions with tox:: 81 | 82 | $ flake8 kernelmethods tests 83 | $ python setup.py test or py.test 84 | $ tox 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub:: 89 | 90 | $ git add . 91 | $ git commit -m "Your detailed description of your changes." 92 | $ git push origin name-of-your-bugfix-or-feature 93 | 94 | 7. Submit a pull request through the GitHub website. 95 | 96 | Pull Request Guidelines 97 | ----------------------- 98 | 99 | Before you submit a pull request, check that it meets these guidelines: 100 | 101 | 1. The pull request should include tests. 102 | 2. If the pull request adds functionality, the docs should be updated. Put 103 | your new functionality into a function with a docstring, and add the 104 | feature to the list in README.rst. 105 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy. Check 106 | https://travis-ci.org/raamana/kernelmethods/pull_requests 107 | and make sure that the tests pass for all supported Python versions. 108 | 109 | Tips 110 | ---- 111 | 112 | To run a subset of tests:: 113 | 114 | $ py.test tests.test_kernelmethods 115 | 116 | 117 | Deploying 118 | --------- 119 | 120 | A reminder for the maintainers on how to deploy. 121 | Make sure all your changes are committed (including an entry in HISTORY.rst). 122 | Then run:: 123 | 124 | $ bumpversion patch # possible: major / minor / patch 125 | $ git push 126 | $ git push --tags 127 | 128 | Travis will then deploy to PyPI if tests pass. 
129 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.2 (2019-08-08) 6 | ------------------ 7 | 8 | * First full release 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache Software License 2.0 2 | 3 | Copyright (c) 2018, Pradeep Reddy Raamana 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CONTRIBUTING.rst 2 | include HISTORY.rst 3 | include LICENSE 4 | include README.rst 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 11 | include versioneer.py 12 | include kernelmethods/_version.py 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . 
-name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | flake8 kernelmethods tests 55 | 56 | test: ## run tests quickly with the default Python 57 | py.test 58 | 59 | test-all: ## run tests on every Python version with tox 60 | tox 61 | 62 | coverage: ## check code coverage quickly with the default Python 63 | coverage run --source kernelmethods -m pytest 64 | coverage report -m 65 | coverage html 66 | $(BROWSER) htmlcov/index.html 67 | 68 | docs: ## generate Sphinx HTML documentation, including API docs 69 | rm -f docs/kernelmethods.rst 70 | rm -f docs/modules.rst 71 | sphinx-apidoc -o docs/ kernelmethods 72 | $(MAKE) -C docs clean 73 | $(MAKE) -C docs html 74 | $(BROWSER) docs/_build/html/index.html 75 | 76 | servedocs: docs ## compile the docs watching for changes 77 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 78 | 79 | release: dist ## package and upload a release 80 | twine upload dist/* 81 | 82 | dist: clean ## builds source and wheel package 83 | python setup.py sdist 84 | python setup.py bdist_wheel 85 | ls -l dist 86 | 87 | install: clean ## install the package to the active Python's site-packages 88 | python setup.py install 89 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Kernel methods and classes 3 | ========================== 4 | 5 | .. image:: docs/logo_kernelmethods.png 6 | :height: 150 7 | 8 | .. image:: https://img.shields.io/pypi/v/kernelmethods.svg 9 | :target: https://pypi.python.org/pypi/kernelmethods 10 | 11 | .. image:: https://img.shields.io/travis/raamana/kernelmethods.svg 12 | :target: https://travis-ci.org/raamana/kernelmethods 13 | 14 | .. 
image:: https://coveralls.io/repos/github/raamana/kernelmethods/badge.svg?branch=master 15 | :target: https://coveralls.io/github/raamana/kernelmethods?branch=master 16 | 17 | 18 | Documentation 19 | --------------- 20 | 21 | API and usage: https://raamana.github.io/kernelmethods/ 22 | 23 | 24 | Demo notebooks (no setup required, click on the binder logo) : 25 | 26 | .. image:: https://mybinder.org/badge_logo.svg 27 | :target: https://mybinder.org/v2/gh/raamana/kernelmethods/master?filepath=demo_tutorials%2Fdemo_kernelmethods.ipynb 28 | 29 | Arxiv preprint 30 | --------------- 31 | 32 | A paper presenting the design and some validation is available at https://arxiv.org/abs/2005.13483 33 | 34 | 35 | News 36 | ------ 37 | 38 | - Hadamard kernel is now available (which showed potential in some bioinformatics applications like breast cancer detection). 39 | 40 | 41 | Description 42 | ------------- 43 | 44 | 45 | ``kernelmethods`` is a pure python library defining modular classes that provide basic kernel methods as well as an intuitive interface for advanced functionality such as composite and hyper kernels. This library fills an important void in the ever-growing python-based machine learning ecosystem, where users can only use predefined kernels and are not able to customize or extend them for their own applications, which require great flexibility owing to their diversity and the need for better-performing kernels. 46 | 47 | *schematic*: 48 | 49 | .. image:: docs/flyer.png 50 | 51 | This library defines the ``KernelMatrix`` class that is central to all the kernel methods and machines. As the ``KernelMatrix`` class is a key bridge between input data and the various kernel learning algorithms, it is designed to be highly usable and extensible to different applications and data types.
Besides being able to apply basic kernels on a given sample (to produce a ``KernelMatrix``), this library provides various kernel operations, such as normalization, centering, product, alignment evaluation, linear combination and ranking (by various performance metrics) of kernel matrices. 52 | 53 | In addition, we provide several convenient classes, such as ``KernelSet`` and ``KernelBucket`` for easy management of a large collection of kernels. Dealing with a diverse configuration of kernels is necessary for automatic kernel selection and optimization in applications such as Multiple Kernel Learning (MKL) and the like. 54 | 55 | In addition to the common numerical kernels such as the Gaussian and Polynomial kernels, we designed this library to make it easy to develop categorical, string and graph kernels, with the same attractive properties of intuitive and highly-testable API. In addition to providing native implementation of non-numerical kernels, we aim to provide a deeply and easily extensible framework for arbitrary input data types, such as sequences, trees and graphs etc, via data structures such as ``pyradigm``. 56 | 57 | Moreover, drop-in ``Estimator`` classes are provided, called ``KernelMachine``, offering the power of ``SVM`` for seamless usage in the ``scikit-learn`` ecosystem. Another useful class is called ``OptimalKernelSVR`` which finds the optimal kernel function for a given sample, and trains the SVM using the optimal kernel. 58 | 59 | 60 | Docs 61 | ---- 62 | 63 | API and Usage: https://raamana.github.io/kernelmethods/ 64 | 65 | Demo notebook: `on binder <https://mybinder.org/v2/gh/raamana/kernelmethods/master?filepath=demo_tutorials%2Fdemo_kernelmethods.ipynb>`_. 66 | 67 | A paper presenting the design and some validation is available `here <https://arxiv.org/abs/2005.13483>`_ 68 | 69 | Note 70 | ---- 71 | 72 | The software is beta. All types of contributions are greatly welcome.
73 | 74 | 75 | Dedication 76 | ----------- 77 | 78 | This library is dedicated to `The Concert for Bangladesh `_, George Harrison and Pandit Ravi Shankar, who moved me immensely with their empathy and kindness, by organizing the first benefit concert ever to raise international awareness and funds for Bangladesh's liberation war in 1971. 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /cmd2pkg: -------------------------------------------------------------------------------- 1 | 2 | rm -rf dist build kernelmethods.egg-info 3 | 4 | python setup.py sdist bdist_wheel 5 | 6 | twine upload dist/* 7 | 8 | rm -rf dist build kernelmethods.egg-info 9 | 10 | -------------------------------------------------------------------------------- /datasets/libsvm/README.txt: -------------------------------------------------------------------------------- 1 | 2 | Datasets are copied from JKernelMachine's repo at 3 | 4 | https://github.com/davidpicard/jkernelmachines/tree/master/resources 5 | 6 | for the purpose of comparison to experiments in their paper: JKernelMachines and Weka. 7 | 8 | -------------------------------------------------------------------------------- /datasets/simple_comparison_JKMs_Weka.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This is a simple comparison of kernelmethods to JKernelMachines and Weka. 
4 | 5 | Repeated holdout (80% train, 20% test) with 20 repetitions, on four UCI datasets 6 | 7 | """ 8 | 9 | from os.path import abspath, dirname, join as pjoin 10 | from time import gmtime, strftime 11 | from warnings import simplefilter 12 | 13 | import numpy as np 14 | from sklearn.datasets.svmlight_format import load_svmlight_file 15 | from sklearn.model_selection import ShuffleSplit, cross_val_score 16 | from sklearn.svm import SVC 17 | 18 | from kernelmethods.algorithms import KernelMachine 19 | from kernelmethods.numeric_kernels import GaussianKernel 20 | from kernelmethods.utils import _ensure_min_eps 21 | 22 | simplefilter('ignore') 23 | 24 | ds_dir = dirname(abspath(__file__)) 25 | ds_names = ( 26 | "ionosphere_scale", 27 | "heart_scale", 28 | "breast-cancer_scale", 29 | "german.numer_scale",) 30 | 31 | ds_paths = [pjoin(ds_dir, 'libsvm', name) for name in ds_names] 32 | 33 | 34 | def sigma_from_gamma(gamma=0.1): 35 | return _ensure_min_eps(np.sqrt(1.0 / (2 * gamma))) 36 | 37 | 38 | def gamma_from_sigma(sigma=0.1): 39 | return _ensure_min_eps(1.0 / (2 * sigma ** 2)) 40 | 41 | 42 | for name, ds_path in zip(ds_names, ds_paths): 43 | time_stamp = strftime("%H:%M:%S", gmtime()) 44 | 45 | X, y = load_svmlight_file(ds_path) 46 | X = X.toarray() 47 | 48 | print('\n{:10} {:20} {}'.format(time_stamp, name, X.shape)) 49 | 50 | gamma = 0.1 51 | skl_svm = SVC(C=1.0, kernel='rbf', gamma=gamma) 52 | ss_cv1 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2) 53 | scores_skl = cross_val_score(skl_svm, X, y, cv=ss_cv1) 54 | 55 | ker_func = GaussianKernel(sigma=sigma_from_gamma(gamma)) 56 | km_svm = KernelMachine(k_func=ker_func, learner_id='SVM', normalized=False) 57 | ss_cv2 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2) 58 | scores_km = cross_val_score(km_svm, X, y, cv=ss_cv2) 59 | 60 | print('\tSKLearn Accuracy: {:.4f} +/- {:.4f}' 61 | ''.format(np.mean(scores_skl), np.std(scores_skl))) 62 | 63 | print('\tKM SVM Accuracy: {:.4f} +/- {:.4f}' 64 | 
''.format(np.mean(scores_km), np.std(scores_km))) 65 | -------------------------------------------------------------------------------- /docs/API.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | -------------- 3 | 4 | A tutorial-like presentation is available at :doc:`usage`. 5 | 6 | This library consists of a set of key classes such as ``KernelMatrix`` documented in :doc:`kernel_matrix`, diverse library of :doc:`kernel_functions`, ``KernelSet`` and ``KernelBucket`` described in :doc:`km_collections`, along with a library of :doc:`operations` and related :doc:`utilities`. 7 | 8 | 9 | Exceptions 10 | ========== 11 | 12 | .. autoclass:: kernelmethods.KernelMethodsException 13 | :undoc-members: 14 | :inherited-members: 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = kernelmethods 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/categorical_kernels.rst: -------------------------------------------------------------------------------- 1 | Categorical kernels 2 | ------------------- 3 | 4 | Implemented: 5 | 6 | - ``MatchCountKernel`` 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # kernelmethods documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 
20 | # 21 | import os 22 | import sys 23 | 24 | # Get the project root dir, which is the parent dir of this 25 | cwd = os.getcwd() 26 | project_root = os.path.dirname(cwd) 27 | 28 | sys.path.insert(0, os.path.abspath('..')) 29 | sys.path.insert(0, project_root) 30 | 31 | sys.path.insert(0, os.path.abspath('../visualqc')) 32 | sys.path.insert(0, os.path.abspath('../../visualqc')) 33 | sys.path.insert(0, os.path.abspath('visualqc')) 34 | 35 | import kernelmethods 36 | import sphinx_rtd_theme 37 | 38 | # -- General configuration --------------------------------------------- 39 | 40 | # If your documentation needs a minimal Sphinx version, state it here. 41 | # 42 | # needs_sphinx = '1.0' 43 | 44 | # Add any Sphinx extension module names here, as strings. They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 46 | extensions = ['sphinx.ext.autodoc', 47 | 'sphinx.ext.intersphinx', 48 | 'sphinx.ext.mathjax', 49 | 'sphinx.ext.viewcode', 50 | 'sphinx.ext.githubpages', 51 | 'numpydoc', 52 | 'sphinxarg.ext', 53 | 'sphinx.ext.intersphinx'] 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The suffix(es) of source filenames. 59 | # You can specify multiple suffix as a list of string: 60 | # 61 | # source_suffix = ['.rst', '.md'] 62 | source_suffix = '.rst' 63 | 64 | # The master toctree document. 65 | master_doc = 'index' 66 | 67 | # General information about the project. 68 | project = u'kernelmethods' 69 | copyright = u"2018, Pradeep Reddy Raamana" 70 | author = u"Pradeep Reddy Raamana" 71 | 72 | # The version info for the project you're documenting, acts as replacement 73 | # for |version| and |release|, also used in various other places throughout 74 | # the built documents. 75 | # 76 | # The short X.Y version. 77 | version = kernelmethods.__version__ 78 | # The full version, including alpha/beta/rc tags. 
79 | release = kernelmethods.__version__ 80 | 81 | # The language for content autogenerated by Sphinx. Refer to documentation 82 | # for a list of supported languages. 83 | # 84 | # This is also used if you do content translation via gettext catalogs. 85 | # Usually you set "language" from the command line for these cases. 86 | language = None 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 90 | # This patterns also effect to html_static_path and html_extra_path 91 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'sphinx' 95 | 96 | # If true, `todo` and `todoList` produce output, else they produce nothing. 97 | todo_include_todos = False 98 | 99 | 100 | # -- Options for HTML output ------------------------------------------- 101 | 102 | # The theme to use for HTML and HTML Help pages. See the documentation for 103 | # a list of builtin themes. 104 | # 105 | # html_theme = 'alabaster' 106 | html_theme = "sphinx_rtd_theme" 107 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 108 | 109 | # Theme options are theme-specific and customize the look and feel of a 110 | # theme further. For a list of options available for each theme, see the 111 | # documentation. 112 | # 113 | # html_theme_options = {} 114 | 115 | # Add any paths that contain custom static files (such as style sheets) here, 116 | # relative to this directory. They are copied after the builtin static files, 117 | # so a file named "default.css" will overwrite the builtin "default.css". 118 | html_static_path = ['_static'] 119 | 120 | 121 | # -- Options for HTMLHelp output --------------------------------------- 122 | 123 | # Output file base name for HTML help builder. 
124 | htmlhelp_basename = 'kernelmethodsdoc' 125 | 126 | 127 | # -- Options for LaTeX output ------------------------------------------ 128 | 129 | latex_elements = { 130 | # The paper size ('letterpaper' or 'a4paper'). 131 | # 132 | # 'papersize': 'letterpaper', 133 | 134 | # The font size ('10pt', '11pt' or '12pt'). 135 | # 136 | # 'pointsize': '10pt', 137 | 138 | # Additional stuff for the LaTeX preamble. 139 | # 140 | # 'preamble': '', 141 | 142 | # Latex figure (float) alignment 143 | # 144 | # 'figure_align': 'htbp', 145 | } 146 | 147 | # Grouping the document tree into LaTeX files. List of tuples 148 | # (source start file, target name, title, author, documentclass 149 | # [howto, manual, or own class]). 150 | latex_documents = [ 151 | (master_doc, 'kernelmethods.tex', 152 | u'kernelmethods Documentation', 153 | u'Pradeep Reddy Raamana', 'manual'), 154 | ] 155 | 156 | 157 | # -- Options for manual page output ------------------------------------ 158 | 159 | # One entry per manual page. List of tuples 160 | # (source start file, name, description, authors, manual section). 161 | man_pages = [ 162 | (master_doc, 'kernelmethods', 163 | u'kernelmethods Documentation', 164 | [author], 1) 165 | ] 166 | 167 | 168 | # -- Options for Texinfo output ---------------------------------------- 169 | 170 | # Grouping the document tree into Texinfo files. List of tuples 171 | # (source start file, target name, title, author, 172 | # dir menu entry, description, category) 173 | texinfo_documents = [ 174 | (master_doc, 'kernelmethods', 175 | u'kernelmethods Documentation', 176 | author, 177 | 'kernelmethods', 178 | 'One line description of project.', 179 | 'Miscellaneous'), 180 | ] 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/flyer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raamana/kernelmethods/5497b572edc588027f9498d873afca0763d8e8e7/docs/flyer.png -------------------------------------------------------------------------------- /docs/graph_kernels.rst: -------------------------------------------------------------------------------- 1 | Graph kernels (coming soon) 2 | ---------------------------- 3 | 4 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to kernelmethods's documentation! 2 | ========================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | usage 11 | API 12 | kernel_matrix 13 | kernel_functions 14 | km_collections 15 | operations 16 | utilities 17 | numeric_kernels 18 | categorical_kernels 19 | string_kernels 20 | graph_kernels 21 | contributing 22 | history 23 | 24 | Indices and tables 25 | ================== 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install kernelmethods, run this command in your terminal: 12 | 13 | .. 
code-block:: console 14 | 15 | $ pip install kernelmethods 16 | 17 | This is the preferred method to install kernelmethods, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for kernelmethods can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone https://github.com/raamana/kernelmethods 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OL https://github.com/raamana/kernelmethods/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. _Github repo: https://github.com/raamana/kernelmethods 51 | .. _tarball: https://github.com/raamana/kernelmethods/tarball/master 52 | -------------------------------------------------------------------------------- /docs/kernel_functions.rst: -------------------------------------------------------------------------------- 1 | Kernel functions 2 | ---------------- 3 | 4 | Kernel functions are the key to producing kernel matrices and hence are the backbone of kernel methods and machines. These are represented by a fundamental [abstract base] class called ``BaseKernelFunction``, which defines several desirable properties, such as making it callable, an easy way to check if it induces a positive semi-definite kernel matrix, as well as a readable representation of the underlying function.
5 | 6 | We also provide a ``KernelFromCallable`` class which makes it even easier to define a kernel function just by specifying the underlying function, without having to define a fully separate class. 7 | 8 | In addition, the following classes are provided to enable compositional representation of multiple kernel functions for advanced applications: ``CompositeKernel``, ``ProductKernel``, ``SumKernel``, ``AverageKernel``, and ``WeightedAverageKernel``. 9 | 10 | 11 | ``kernelmethods`` offers kernel functions that can operate on the following data types: 12 | 13 | - :doc:`numeric_kernels` 14 | - :doc:`categorical_kernels` 15 | - :doc:`string_kernels` 16 | - :doc:`graph_kernels` 17 | - and others such as trees and sequences (TBA). 18 | 19 | .. automodule:: kernelmethods 20 | :members: BaseKernelFunction, KernelFromCallable 21 | :undoc-members: 22 | :inherited-members: 23 | :show-inheritance: 24 | 25 | 26 | Composite kernel functions 27 | --------------------------- 28 | 29 | 30 | .. automodule:: kernelmethods 31 | :members: CompositeKernel, ProductKernel, SumKernel, AverageKernel, WeightedAverageKernel 32 | :undoc-members: 33 | :inherited-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /docs/kernel_matrix.rst: -------------------------------------------------------------------------------- 1 | KernelMatrix class 2 | ------------------ 3 | 4 | ``KernelMatrix`` is a self-contained class for the Gram matrix induced by a kernel function on a given sample. This class defines the central data structure for all kernel methods, as it acts as a key bridge between input data space and the learning algorithms.
5 | 6 | The class is designed in such a way that 7 | 8 | - it only computes elements of the kernel matrix (KM) as needed, and nothing more, which can save a lot of computation and storage 9 | - it supports both callable as well as attribute access, allowing easy access to partial or random portions of the KM. Indexing is aimed to be compliant with numpy as much as possible. 10 | - allows parallel computation of different parts of the KM to speed up computation when ``N`` is large 11 | - allows setting of user-defined attributes to allow easy identification and differentiation among a collection of KMs when working in applications such as Multiple Kernel Learning (MKL) 12 | - implements basic operations such as centering and normalization (whose implementation differs from that of manipulating regular matrices) 13 | - exposes several convenience attributes to make advanced development a breeze 14 | 15 | This library also provides convenience wrappers: 16 | 17 | - ``KernelMatrixPrecomputed`` turns a precomputed kernel matrix into a ``KernelMatrix`` class with all its attractive properties 18 | - ``ConstantKernelMatrix`` that defines a ``KernelMatrix`` with a constant everywhere 19 | 20 | 21 | .. autoclass:: kernelmethods.KernelMatrix 22 | :members: 23 | :undoc-members: 24 | 25 | 26 | Exceptions 27 | ========== 28 | 29 | .. autoclass:: kernelmethods.KMAccessError 30 | :undoc-members: 31 | :inherited-members: 32 | :show-inheritance: 33 | 34 | -------------------------------------------------------------------------------- /docs/km_collections.rst: -------------------------------------------------------------------------------- 1 | Collection of kernel matrices 2 | ----------------------------- 3 | 4 | 5 | Kernel Set 6 | ============== 7 | 8 | .. autoclass:: kernelmethods.KernelSet 9 | :undoc-members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | 14 | Kernel Bucket 15 | ============== 16 | 17 | ..
autoclass:: kernelmethods.KernelBucket 18 | :undoc-members: 19 | :inherited-members: 20 | :show-inheritance: 21 | 22 | 23 | Exceptions 24 | ========== 25 | 26 | .. autoclass:: kernelmethods.KMSetAdditionError 27 | :undoc-members: 28 | :inherited-members: 29 | :show-inheritance: 30 | 31 | -------------------------------------------------------------------------------- /docs/logo_kernelmethods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raamana/kernelmethods/5497b572edc588027f9498d873afca0763d8e8e7/docs/logo_kernelmethods.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=kernelmethods 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/numeric_kernels.rst: -------------------------------------------------------------------------------- 1 | Numeric kernels 2 | ----------------- 3 | 4 | 5 | .. 
automodule:: kernelmethods.numeric_kernels 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | -------------------------------------------------------------------------------- /docs/operations.rst: -------------------------------------------------------------------------------- 1 | Kernel Operations 2 | ------------------------ 3 | 4 | 5 | 6 | .. automodule:: kernelmethods.operations 7 | :members: 8 | :undoc-members: 9 | 10 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/string_kernels.rst: -------------------------------------------------------------------------------- 1 | String kernels (coming soon) 2 | ----------------------------- 3 | 4 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | The demo notebooks are available on Binder (click the badge below to launch them): 6 | 7 | Demo notebooks: 8 | 9 | .. image:: https://mybinder.org/badge_logo.svg 10 | :target: https://mybinder.org/v2/gh/raamana/kernelmethods/master?filepath=demo_tutorials%2Fdemo_kernelmethods.ipynb 11 | 12 | 13 | You can also get them directly from the `code repo <https://github.com/raamana/kernelmethods/tree/master/demo_tutorials>`_. 14 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ---------- 3 | 4 | Here, we document several important utilities related to this library. 5 | 6 | ..
automodule:: kernelmethods.ranking 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | -------------------------------------------------------------------------------- /kernelmethods/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for kernelmethods.""" 4 | 5 | __all__ = ['KernelMatrix', 6 | 'BaseKernelFunction', 7 | 'KernelMethodsException', 'KMAccessError', 'KMNormError', 8 | 'KMSetAdditionError', 9 | 'PolyKernel', 'GaussianKernel', 'LaplacianKernel', 'LinearKernel', 10 | 'Chi2Kernel', 'SigmoidKernel', 'HadamardKernel', 11 | 'KernelBucket', 'KernelSet', 12 | 'KernelMachine', 'OptimalKernelSVC', 'OptimalKernelSVR', ] 13 | 14 | from kernelmethods.algorithms import (KernelMachine, KernelMachineRegressor, 15 | OptimalKernelSVC, OptimalKernelSVR) 16 | from kernelmethods.base import BaseKernelFunction, KernelMatrix, KernelSet 17 | from kernelmethods.config import (KMAccessError, KMNormError, KMSetAdditionError, 18 | KernelMethodsException) 19 | from kernelmethods.numeric_kernels import (Chi2Kernel, GaussianKernel, 20 | LaplacianKernel, LinearKernel, PolyKernel, 21 | SigmoidKernel, HadamardKernel) 22 | from kernelmethods.sampling import KernelBucket 23 | from ._version import get_versions 24 | 25 | __version__ = get_versions()['version'] 26 | del get_versions 27 | 28 | __author__ = """Pradeep Reddy Raamana""" 29 | __email__ = 'raamana@gmail.com' 30 | -------------------------------------------------------------------------------- /kernelmethods/_version.py: -------------------------------------------------------------------------------- 1 | 2 | # This file helps to compute a version number in source trees obtained from 3 | # git-archive tarball (such as those provided by githubs download-from-tag 4 | # feature). 
Distribution tarballs (built by setup.py sdist) and build 5 | # directories (produced by setup.py build) will contain a much shorter file 6 | # that just contains the computed version number. 7 | 8 | # This file is released into the public domain. Generated by 9 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 10 | 11 | """Git implementation of _version.py.""" 12 | 13 | import errno 14 | import os 15 | import re 16 | import subprocess 17 | import sys 18 | 19 | 20 | def get_keywords(): 21 | """Get the keywords needed to look up the version information.""" 22 | # these strings will be replaced by git during git-archive. 23 | # setup.py/versioneer.py will grep for the variable names, so they must 24 | # each be defined on a line of their own. _version.py will just call 25 | # get_keywords(). 26 | git_refnames = " (HEAD -> master)" 27 | git_full = "5497b572edc588027f9498d873afca0763d8e8e7" 28 | git_date = "2023-02-07 09:05:32 -0500" 29 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 30 | return keywords 31 | 32 | 33 | class VersioneerConfig: 34 | """Container for Versioneer configuration parameters.""" 35 | 36 | 37 | def get_config(): 38 | """Create, populate and return the VersioneerConfig() object.""" 39 | # these strings are filled in when 'setup.py versioneer' creates 40 | # _version.py 41 | cfg = VersioneerConfig() 42 | cfg.VCS = "git" 43 | cfg.style = "pep440" 44 | cfg.tag_prefix = "" 45 | cfg.parentdir_prefix = "kernelmethods-" 46 | cfg.versionfile_source = "kernelmethods/_version.py" 47 | cfg.verbose = False 48 | return cfg 49 | 50 | 51 | class NotThisMethod(Exception): 52 | """Exception raised if a method is not valid for the current scenario.""" 53 | 54 | 55 | LONG_VERSION_PY = {} 56 | HANDLERS = {} 57 | 58 | 59 | def register_vcs_handler(vcs, method): # decorator 60 | """Decorator to mark a method as the handler for a particular VCS.""" 61 | def decorate(f): 62 | """Store f in HANDLERS[vcs][method].""" 63 | if vcs 
not in HANDLERS: 64 | HANDLERS[vcs] = {} 65 | HANDLERS[vcs][method] = f 66 | return f 67 | return decorate 68 | 69 | 70 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, 71 | env=None): 72 | """Call the given command(s).""" 73 | assert isinstance(commands, list) 74 | p = None 75 | for c in commands: 76 | try: 77 | dispcmd = str([c] + args) 78 | # remember shell=False, so use git.cmd on windows, not just git 79 | p = subprocess.Popen([c] + args, cwd=cwd, env=env, 80 | stdout=subprocess.PIPE, 81 | stderr=(subprocess.PIPE if hide_stderr 82 | else None)) 83 | break 84 | except EnvironmentError: 85 | e = sys.exc_info()[1] 86 | if e.errno == errno.ENOENT: 87 | continue 88 | if verbose: 89 | print("unable to run %s" % dispcmd) 90 | print(e) 91 | return None, None 92 | else: 93 | if verbose: 94 | print("unable to find command, tried %s" % (commands,)) 95 | return None, None 96 | stdout = p.communicate()[0].strip() 97 | if sys.version_info[0] >= 3: 98 | stdout = stdout.decode() 99 | if p.returncode != 0: 100 | if verbose: 101 | print("unable to run %s (error)" % dispcmd) 102 | print("stdout was %s" % stdout) 103 | return None, p.returncode 104 | return stdout, p.returncode 105 | 106 | 107 | def versions_from_parentdir(parentdir_prefix, root, verbose): 108 | """Try to determine the version from the parent directory name. 109 | 110 | Source tarballs conventionally unpack into a directory that includes both 111 | the project name and a version string. 
We will also support searching up 112 | two directory levels for an appropriately named parent directory 113 | """ 114 | rootdirs = [] 115 | 116 | for i in range(3): 117 | dirname = os.path.basename(root) 118 | if dirname.startswith(parentdir_prefix): 119 | return {"version": dirname[len(parentdir_prefix):], 120 | "full-revisionid": None, 121 | "dirty": False, "error": None, "date": None} 122 | else: 123 | rootdirs.append(root) 124 | root = os.path.dirname(root) # up a level 125 | 126 | if verbose: 127 | print("Tried directories %s but none started with prefix %s" % 128 | (str(rootdirs), parentdir_prefix)) 129 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 130 | 131 | 132 | @register_vcs_handler("git", "get_keywords") 133 | def git_get_keywords(versionfile_abs): 134 | """Extract version information from the given file.""" 135 | # the code embedded in _version.py can just fetch the value of these 136 | # keywords. When used from setup.py, we don't want to import _version.py, 137 | # so we do it with a regexp instead. This function is not used from 138 | # _version.py. 
139 | keywords = {} 140 | try: 141 | f = open(versionfile_abs, "r") 142 | for line in f.readlines(): 143 | if line.strip().startswith("git_refnames ="): 144 | mo = re.search(r'=\s*"(.*)"', line) 145 | if mo: 146 | keywords["refnames"] = mo.group(1) 147 | if line.strip().startswith("git_full ="): 148 | mo = re.search(r'=\s*"(.*)"', line) 149 | if mo: 150 | keywords["full"] = mo.group(1) 151 | if line.strip().startswith("git_date ="): 152 | mo = re.search(r'=\s*"(.*)"', line) 153 | if mo: 154 | keywords["date"] = mo.group(1) 155 | f.close() 156 | except EnvironmentError: 157 | pass 158 | return keywords 159 | 160 | 161 | @register_vcs_handler("git", "keywords") 162 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 163 | """Get version information from git keywords.""" 164 | if not keywords: 165 | raise NotThisMethod("no keywords at all, weird") 166 | date = keywords.get("date") 167 | if date is not None: 168 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 169 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 170 | # -like" string, which we must then edit to make compliant), because 171 | # it's been around since git-1.5.3, and it's too difficult to 172 | # discover which version we're using, or to work around using an 173 | # older one. 174 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 175 | refnames = keywords["refnames"].strip() 176 | if refnames.startswith("$Format"): 177 | if verbose: 178 | print("keywords are unexpanded, not using") 179 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 180 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 181 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 182 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 183 | TAG = "tag: " 184 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) 185 | if not tags: 186 | # Either we're using git < 1.8.3, or there really are no tags. 
We use 187 | # a heuristic: assume all version tags have a digit. The old git %d 188 | # expansion behaves like git log --decorate=short and strips out the 189 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 190 | # between branches and tags. By ignoring refnames without digits, we 191 | # filter out many common branch names like "release" and 192 | # "stabilization", as well as "HEAD" and "master". 193 | tags = set([r for r in refs if re.search(r'\d', r)]) 194 | if verbose: 195 | print("discarding '%s', no digits" % ",".join(refs - tags)) 196 | if verbose: 197 | print("likely tags: %s" % ",".join(sorted(tags))) 198 | for ref in sorted(tags): 199 | # sorting will prefer e.g. "2.0" over "2.0rc1" 200 | if ref.startswith(tag_prefix): 201 | r = ref[len(tag_prefix):] 202 | if verbose: 203 | print("picking %s" % r) 204 | return {"version": r, 205 | "full-revisionid": keywords["full"].strip(), 206 | "dirty": False, "error": None, 207 | "date": date} 208 | # no suitable tags, so version is "0+unknown", but full hex is still there 209 | if verbose: 210 | print("no suitable tags, using unknown + full revision id") 211 | return {"version": "0+unknown", 212 | "full-revisionid": keywords["full"].strip(), 213 | "dirty": False, "error": "no suitable tags", "date": None} 214 | 215 | 216 | @register_vcs_handler("git", "pieces_from_vcs") 217 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 218 | """Get version from 'git describe' in the root of the source tree. 219 | 220 | This only gets called if the git-archive 'subst' keywords were *not* 221 | expanded, and _version.py hasn't already been rewritten with a short 222 | version string, meaning we're inside a checked out source tree. 
223 | """ 224 | GITS = ["git"] 225 | if sys.platform == "win32": 226 | GITS = ["git.cmd", "git.exe"] 227 | 228 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, 229 | hide_stderr=True) 230 | if rc != 0: 231 | if verbose: 232 | print("Directory %s not under git control" % root) 233 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 234 | 235 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 236 | # if there isn't one, this yields HEX[-dirty] (no NUM) 237 | describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", 238 | "--always", "--long", 239 | "--match", "%s*" % tag_prefix], 240 | cwd=root) 241 | # --long was added in git-1.5.5 242 | if describe_out is None: 243 | raise NotThisMethod("'git describe' failed") 244 | describe_out = describe_out.strip() 245 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) 246 | if full_out is None: 247 | raise NotThisMethod("'git rev-parse' failed") 248 | full_out = full_out.strip() 249 | 250 | pieces = {} 251 | pieces["long"] = full_out 252 | pieces["short"] = full_out[:7] # maybe improved later 253 | pieces["error"] = None 254 | 255 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 256 | # TAG might have hyphens. 257 | git_describe = describe_out 258 | 259 | # look for -dirty suffix 260 | dirty = git_describe.endswith("-dirty") 261 | pieces["dirty"] = dirty 262 | if dirty: 263 | git_describe = git_describe[:git_describe.rindex("-dirty")] 264 | 265 | # now we have TAG-NUM-gHEX or HEX 266 | 267 | if "-" in git_describe: 268 | # TAG-NUM-gHEX 269 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 270 | if not mo: 271 | # unparseable. Maybe git-describe is misbehaving? 
272 | pieces["error"] = ("unable to parse git-describe output: '%s'" 273 | % describe_out) 274 | return pieces 275 | 276 | # tag 277 | full_tag = mo.group(1) 278 | if not full_tag.startswith(tag_prefix): 279 | if verbose: 280 | fmt = "tag '%s' doesn't start with prefix '%s'" 281 | print(fmt % (full_tag, tag_prefix)) 282 | pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" 283 | % (full_tag, tag_prefix)) 284 | return pieces 285 | pieces["closest-tag"] = full_tag[len(tag_prefix):] 286 | 287 | # distance: number of commits since tag 288 | pieces["distance"] = int(mo.group(2)) 289 | 290 | # commit: short hex revision ID 291 | pieces["short"] = mo.group(3) 292 | 293 | else: 294 | # HEX: no tags 295 | pieces["closest-tag"] = None 296 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], 297 | cwd=root) 298 | pieces["distance"] = int(count_out) # total number of commits 299 | 300 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 301 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], 302 | cwd=root)[0].strip() 303 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 304 | 305 | return pieces 306 | 307 | 308 | def plus_or_dot(pieces): 309 | """Return a + if we don't already have one, else return a .""" 310 | if "+" in pieces.get("closest-tag", ""): 311 | return "." 312 | return "+" 313 | 314 | 315 | def render_pep440(pieces): 316 | """Build up version string, with post-release "local version identifier". 317 | 318 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 319 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 320 | 321 | Exceptions: 322 | 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] 323 | """ 324 | if pieces["closest-tag"]: 325 | rendered = pieces["closest-tag"] 326 | if pieces["distance"] or pieces["dirty"]: 327 | rendered += plus_or_dot(pieces) 328 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 329 | if pieces["dirty"]: 330 | rendered += ".dirty" 331 | else: 332 | # exception #1 333 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], 334 | pieces["short"]) 335 | if pieces["dirty"]: 336 | rendered += ".dirty" 337 | return rendered 338 | 339 | 340 | def render_pep440_pre(pieces): 341 | """TAG[.post.devDISTANCE] -- No -dirty. 342 | 343 | Exceptions: 344 | 1: no tags. 0.post.devDISTANCE 345 | """ 346 | if pieces["closest-tag"]: 347 | rendered = pieces["closest-tag"] 348 | if pieces["distance"]: 349 | rendered += ".post.dev%d" % pieces["distance"] 350 | else: 351 | # exception #1 352 | rendered = "0.post.dev%d" % pieces["distance"] 353 | return rendered 354 | 355 | 356 | def render_pep440_post(pieces): 357 | """TAG[.postDISTANCE[.dev0]+gHEX] . 358 | 359 | The ".dev0" means dirty. Note that .dev0 sorts backwards 360 | (a dirty tree will appear "older" than the corresponding clean one), 361 | but you shouldn't be releasing software with -dirty anyways. 362 | 363 | Exceptions: 364 | 1: no tags. 0.postDISTANCE[.dev0] 365 | """ 366 | if pieces["closest-tag"]: 367 | rendered = pieces["closest-tag"] 368 | if pieces["distance"] or pieces["dirty"]: 369 | rendered += ".post%d" % pieces["distance"] 370 | if pieces["dirty"]: 371 | rendered += ".dev0" 372 | rendered += plus_or_dot(pieces) 373 | rendered += "g%s" % pieces["short"] 374 | else: 375 | # exception #1 376 | rendered = "0.post%d" % pieces["distance"] 377 | if pieces["dirty"]: 378 | rendered += ".dev0" 379 | rendered += "+g%s" % pieces["short"] 380 | return rendered 381 | 382 | 383 | def render_pep440_old(pieces): 384 | """TAG[.postDISTANCE[.dev0]] . 385 | 386 | The ".dev0" means dirty. 387 | 388 | Eexceptions: 389 | 1: no tags. 
0.postDISTANCE[.dev0] 390 | """ 391 | if pieces["closest-tag"]: 392 | rendered = pieces["closest-tag"] 393 | if pieces["distance"] or pieces["dirty"]: 394 | rendered += ".post%d" % pieces["distance"] 395 | if pieces["dirty"]: 396 | rendered += ".dev0" 397 | else: 398 | # exception #1 399 | rendered = "0.post%d" % pieces["distance"] 400 | if pieces["dirty"]: 401 | rendered += ".dev0" 402 | return rendered 403 | 404 | 405 | def render_git_describe(pieces): 406 | """TAG[-DISTANCE-gHEX][-dirty]. 407 | 408 | Like 'git describe --tags --dirty --always'. 409 | 410 | Exceptions: 411 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 412 | """ 413 | if pieces["closest-tag"]: 414 | rendered = pieces["closest-tag"] 415 | if pieces["distance"]: 416 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 417 | else: 418 | # exception #1 419 | rendered = pieces["short"] 420 | if pieces["dirty"]: 421 | rendered += "-dirty" 422 | return rendered 423 | 424 | 425 | def render_git_describe_long(pieces): 426 | """TAG-DISTANCE-gHEX[-dirty]. 427 | 428 | Like 'git describe --tags --dirty --always -long'. 429 | The distance/hash is unconditional. 430 | 431 | Exceptions: 432 | 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) 433 | """ 434 | if pieces["closest-tag"]: 435 | rendered = pieces["closest-tag"] 436 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 437 | else: 438 | # exception #1 439 | rendered = pieces["short"] 440 | if pieces["dirty"]: 441 | rendered += "-dirty" 442 | return rendered 443 | 444 | 445 | def render(pieces, style): 446 | """Render the given version pieces into the requested style.""" 447 | if pieces["error"]: 448 | return {"version": "unknown", 449 | "full-revisionid": pieces.get("long"), 450 | "dirty": None, 451 | "error": pieces["error"], 452 | "date": None} 453 | 454 | if not style or style == "default": 455 | style = "pep440" # the default 456 | 457 | if style == "pep440": 458 | rendered = render_pep440(pieces) 459 | elif style == "pep440-pre": 460 | rendered = render_pep440_pre(pieces) 461 | elif style == "pep440-post": 462 | rendered = render_pep440_post(pieces) 463 | elif style == "pep440-old": 464 | rendered = render_pep440_old(pieces) 465 | elif style == "git-describe": 466 | rendered = render_git_describe(pieces) 467 | elif style == "git-describe-long": 468 | rendered = render_git_describe_long(pieces) 469 | else: 470 | raise ValueError("unknown style '%s'" % style) 471 | 472 | return {"version": rendered, "full-revisionid": pieces["long"], 473 | "dirty": pieces["dirty"], "error": None, 474 | "date": pieces.get("date")} 475 | 476 | 477 | def get_versions(): 478 | """Get version information or return default if unable to do so.""" 479 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 480 | # __file__, we can work backwards from there to the root. Some 481 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 482 | # case we can only use expanded keywords. 
483 | 484 | cfg = get_config() 485 | verbose = cfg.verbose 486 | 487 | try: 488 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 489 | verbose) 490 | except NotThisMethod: 491 | pass 492 | 493 | try: 494 | root = os.path.realpath(__file__) 495 | # versionfile_source is the relative path from the top of the source 496 | # tree (where the .git directory might live) to this file. Invert 497 | # this to find the root from __file__. 498 | for i in cfg.versionfile_source.split('/'): 499 | root = os.path.dirname(root) 500 | except NameError: 501 | return {"version": "0+unknown", "full-revisionid": None, 502 | "dirty": None, 503 | "error": "unable to find root of source tree", 504 | "date": None} 505 | 506 | try: 507 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 508 | return render(pieces, cfg.style) 509 | except NotThisMethod: 510 | pass 511 | 512 | try: 513 | if cfg.parentdir_prefix: 514 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 515 | except NotThisMethod: 516 | pass 517 | 518 | return {"version": "0+unknown", "full-revisionid": None, 519 | "dirty": None, 520 | "error": "unable to compute version", "date": None} 521 | -------------------------------------------------------------------------------- /kernelmethods/algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Module to gather various high-level algorithms based on the kernel methods, 4 | such as kernel-based predictive models for classification and regression. 
class BaseKernelMachine(BaseEstimator):
    """Generic class to return a drop-in sklearn estimator.

    Parameters
    ----------
    k_func : KernelFunction
        The kernel function the kernel machine bases itself on

    learner_id : str
        Identifier for the estimator to be built based on the kernel function.
        Options: ``SVC`` and ``SVR``.
        Default: ``SVC`` (classifier version of SVM)

    normalized : flag
        Flag to indicate whether to keep the kernel matrix normalized
        Default: False

    """

    def __init__(self,
                 k_func=GaussianKernel(),
                 learner_id='SVC',
                 normalized=False):
        """
        Constructor for the KernelMachine class.

        Parameters
        ----------
        k_func : KernelFunction
            The kernel function the kernel machine bases itself on

        learner_id : str
            Identifier for the estimator to be built based on the kernel function.
            Options: ``SVC`` and ``SVR``.
            Default: ``SVC`` (classifier version of SVM)

        normalized : flag
            Flag to indicate whether to keep the kernel matrix normalized.
            Default: False
        """

        self.k_func = k_func
        self.learner_id = learner_id
        self.normalized = normalized

    def fit(self, X, y, sample_weight=None):
        """Fit the chosen Estimator based on the user-defined kernel.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target values (class labels in classification, real numbers in
            regression)

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : object

        Notes
        ------
        If X and y are not C-ordered and contiguous arrays of np.float64 and
        X is not a scipy.sparse.csr_matrix, X and/or y may be copied.

        If X is a dense array, then the other methods will not support sparse
        matrices as input.

        """

        if is_regressor(self):
            self._train_X, self._train_y = check_X_y(X, y, y_numeric=True)
            # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype
            # and is available in every NumPy version.
            self._train_y = self._train_y.astype(np.float64)
        else:
            self._train_X, self._train_y = check_X_y(X, y)

        self._km = KernelMatrix(self.k_func, name='train_km',
                                normalized=self.normalized)
        self._km.attach_to(self._train_X)

        # get_estimator returns a pair; the second element is stored as
        # param_grid
        self._estimator, self.param_grid = get_estimator(self.learner_id)
        # the full train-by-train kernel matrix is passed in place of X
        self._estimator.fit(X=self._km.full, y=self._train_y,
                            sample_weight=sample_weight)

        if is_classifier(self):
            self.classes_ = self._estimator.classes_

        return self

    def predict(self, X):
        """
        Make predictions on the new samples in X.

        For an one-class model, +1 or -1 is returned.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        y_pred : array, shape (n_samples,)
            Predictions for samples in X, cast to the dtype of the training
            targets.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """

        if not hasattr(self, '_km'):
            raise NotFittedError("Can't predict. Not fitted yet. Run .fit() first!")

        test_X = check_array(X)

        # this is a fresh new KM (it replaces the training KM stored in _km)
        self._km = KernelMatrix(self.k_func, name='test_km',
                                normalized=self.normalized)

        # sample_one must be test data to get the right shape for sklearn X
        self._km.attach_to(sample_one=test_X, sample_two=self._train_X)

        predicted_y = self._estimator.predict(self._km.full)

        return np.asarray(predicted_y, dtype=self._train_y.dtype)

    def get_params(self, deep=True):
        """Return the constructor parameters of this estimator as a dict.

        ``deep`` is accepted for sklearn API compatibility and is not used.
        """

        return {'k_func'    : self.k_func,
                'normalized': self.normalized,
                'learner_id': self.learner_id}

    def set_params(self, **parameters):
        """Set constructor parameters; unrecognised names are ignored.

        Returns
        -------
        self : object
        """

        for parameter, value in parameters.items():
            if parameter in ('k_func', 'learner_id', 'normalized'):
                setattr(self, parameter, value)

        return self

    def _more_tags(self):
        """Return estimator tags that depend on the chosen kernel function."""

        # local import, kept as in the original implementation
        from kernelmethods.numeric_kernels import Chi2Kernel, SigmoidKernel, \
            HadamardKernel
        if isinstance(self.k_func, Chi2Kernel):
            return {'requires_positive_X': True}
        elif isinstance(self.k_func, (SigmoidKernel, HadamardKernel)):
            return {'poor_score': True}
        else:
            return dict()
class BaseOptimalKernelMachine(BaseEstimator):
    """
    An estimator to learn the optimal kernel for a given sample and
    build a support vector machine based on this custom kernel.

    ``fit`` and ``predict`` delegate to ``super()`` with a kernel matrix in
    place of ``X``, so this class is meant to be combined with an sklearn SVM
    estimator (e.g. SVR or SVC) that comes later in the MRO.

    Parameters
    ----------

    k_bucket : KernelBucket or str
        An instance of KernelBucket that contains all the kernels to be compared,
        or a string identifying the sampling_strategy which populates a KernelBucket.

    method : str
        Scoring method to rank different kernels

    C : float, optional (default=1.0)
        Penalty parameter C of the error term.

    epsilon : float, optional (default=0.1)
        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube
        within which no penalty is associated in the training loss function
        with points predicted within a distance epsilon from the actual
        value.

    tol : float, optional (default=1e-3)
        Tolerance for stopping criterion.

    shrinking : boolean, optional (default=True)
        Whether to use the shrinking heuristic.


    Attributes
    ----------
    support_ : array-like, shape = [n_SV]
        Indices of support vectors.

    support_vectors_ : array-like, shape = [nSV, n_features]
        Support vectors.

    dual_coef_ : array, shape = [1, n_SV]
        Coefficients of the support vector in the decision function.

    coef_ : array, shape = [1, n_features]
        Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.

        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`.

    intercept_ : array, shape = [1]
        Constants in decision function.

    """

    @abstractmethod
    def _find_optimal_kernel(self):
        """Method to find the optimal kernel

        Given a kernel bucket, a training sample and a ranking method. To be
        defined by the child class, appropriate for their task i.e. classification
        or regression
        """

    def fit(self, X, y, sample_weight=None):
        """Estimate the optimal kernel, and fit a SVM based on the custom kernel.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target values (class labels in classification, real numbers in
            regression)

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``k_bucket`` is neither a KernelBucket nor a string that
            ``make_kernel_bucket`` accepts.

        Notes
        ------
        If X and y are not C-ordered and contiguous arrays of np.float64 and
        X is not a scipy.sparse.csr_matrix, X and/or y may be copied.

        If X is a dense array, then the other methods will not support sparse
        matrices as input.

        """

        if isinstance(self.k_bucket, str):
            try:
                # using a new internal variable to retain user supplied param
                self._k_bucket = make_kernel_bucket(self.k_bucket)
            except Exception as exc:
                # Only ordinary errors are translated; KeyboardInterrupt and
                # SystemExit propagate untouched. The cause is chained so the
                # underlying failure stays visible.
                raise ValueError('Input for k_bucket can only be an instance of '
                                 'KernelBucket or a sampling strategy to generate '
                                 'one with make_kernel_bucket. '
                                 'sampling strategy must be one of {}'
                                 ''.format(cfg.kernel_bucket_strategies)) from exc
        elif isinstance(self.k_bucket, KernelBucket):
            # work on a copy so the user-supplied bucket is left unmodified
            self._k_bucket = deepcopy(self.k_bucket)
        else:
            raise ValueError('Input for k_bucket can only be an instance of '
                             'KernelBucket or a sampling strategy to generate '
                             'one with make_kernel_bucket')

        self._train_X, self._train_y = check_X_y(X, y, y_numeric=True)

        self.opt_kernel_ = self._find_optimal_kernel()

        # the full kernel matrix of the chosen kernel is passed in place of X
        super().fit(X=self.opt_kernel_.full, y=self._train_y,
                    sample_weight=sample_weight)

        # temporary hack to pass sklearn estimator checks till a bug is fixed
        # for more see: https://github.com/scikit-learn/scikit-learn/issues/14712
        self.n_iter_ = 1

        return self

    def predict(self, X):
        """
        Make predictions for the samples in X using the optimal kernel.

        For an one-class model, +1 or -1 is returned.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        y_pred : array, shape (n_samples,)
            Predictions for samples in X, as returned by the parent estimator.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """

        if not hasattr(self, 'opt_kernel_'):
            raise NotFittedError("Can't predict. Not fitted yet. Run .fit() first!")

        X = check_array(X)

        # sample_one must be test data to get the right shape for sklearn X
        self.opt_kernel_.attach_to(sample_one=X, sample_two=self._train_X)
        test_train_KM = self.opt_kernel_.full
        predicted_y = super().predict(test_train_KM)

        # data type conversion is done in child class, esp. for classification
        # return np.asarray(predicted_y, dtype=np.intp)
        return predicted_y

    @abstractmethod
    def get_params(self, deep=True):
        """returns all the relevant parameters for this estimator!"""

        # example code, for future reference
        return {'k_bucket' : self.k_bucket,
                'method'   : self.method,
                'C'        : self.C,
                'epsilon'  : self.epsilon,
                'shrinking': self.shrinking,
                'tol'      : self.tol}

    @abstractmethod
    def set_params(self, **parameters):
        """Param setter; unrecognised names are ignored."""

        # example code, for future reference
        for parameter, value in parameters.items():
            if parameter in ('k_bucket', 'method',
                             'C', 'epsilon', 'shrinking', 'tol'):
                setattr(self, parameter, value)

        return self
432 | 433 | Parameters 434 | ---------- 435 | 436 | k_bucket : KernelBucket or str 437 | An instance of KernelBucket that contains all the kernels to be compared, 438 | or a string identifying the sampling_strategy which populates a KernelBucket. 439 | 440 | method : str 441 | Scoring method to rank different kernels 442 | 443 | C : float, optional (default=1.0) 444 | Penalty parameter C of the error term. 445 | 446 | epsilon : float, optional (default=0.1) 447 | Epsilon in the epsilon-SVR model. It specifies the epsilon-tube 448 | within which no penalty is associated in the training loss function 449 | with points predicted within a distance epsilon from the actual 450 | value. 451 | 452 | tol : float, optional (default=1e-3) 453 | Tolerance for stopping criterion. 454 | 455 | shrinking : boolean, optional (default=True) 456 | Whether to use the shrinking heuristic. 457 | 458 | 459 | Attributes 460 | ---------- 461 | support_ : array-like, shape = [n_SV] 462 | Indices of support vectors. 463 | 464 | support_vectors_ : array-like, shape = [nSV, n_features] 465 | Support vectors. 466 | 467 | dual_coef_ : array, shape = [1, n_SV] 468 | Coefficients of the support vector in the decision function. 469 | 470 | coef_ : array, shape = [1, n_features] 471 | Weights assigned to the features (coefficients in the primal 472 | problem). This is only available in the case of a linear kernel. 473 | 474 | `coef_` is readonly property derived from `dual_coef_` and 475 | `support_vectors_`. 476 | 477 | intercept_ : array, shape = [1] 478 | Constants in decision function. 
479 | 480 | """ 481 | 482 | 483 | def __init__(self, 484 | k_bucket='exhaustive', 485 | method='cv_risk', 486 | C=1.0, 487 | epsilon=0.1, 488 | shrinking=True, 489 | tol=1e-3): 490 | """ 491 | 492 | Parameters 493 | ---------- 494 | k_bucket : KernelBucket or str 495 | An instance of KernelBucket that contains all the kernels to be compared, 496 | or a string identifying sampling strategy to populate a KernelBucket. 497 | 498 | method : str 499 | Scoring method to rank different kernels 500 | 501 | C : float, optional (default=1.0) 502 | Penalty parameter C of the error term. 503 | 504 | epsilon : float, optional (default=0.1) 505 | Epsilon in the epsilon-SVR model. It specifies the epsilon-tube 506 | within which no penalty is associated in the training loss function 507 | with points predicted within a distance epsilon from the actual 508 | value. 509 | 510 | shrinking : boolean, optional (default=True) 511 | Whether to use the shrinking heuristic. 512 | 513 | tol : float, optional (default=1e-3) 514 | Tolerance for stopping criterion. 
515 | 516 | """ 517 | 518 | # not init'ing SVC/SVR with kernel='precomputed' to avoid issues with 519 | # cross_val_score and safe_split 520 | super().__init__(C=C, epsilon=epsilon, shrinking=shrinking, tol=tol) 521 | 522 | self.k_bucket = k_bucket 523 | self.method = method 524 | self.C = C 525 | self.epsilon = epsilon 526 | self.shrinking = shrinking 527 | self.tol = tol 528 | 529 | 530 | def _find_optimal_kernel(self): 531 | """Method to find the optimal kernel""" 532 | 533 | self._opt_ker_search_est_name = 'SVR' 534 | 535 | return find_optimal_kernel(self._k_bucket, 536 | self._train_X, self._train_y, 537 | method=self.method, 538 | estimator_name=self._opt_ker_search_est_name) 539 | 540 | 541 | def get_params(self, deep=True): 542 | """returns all the relevant parameters for this estimator!""" 543 | 544 | return {'k_bucket' : self.k_bucket, 545 | 'method' : self.method, 546 | 'C' : self.C, 547 | 'epsilon' : self.epsilon, 548 | 'shrinking': self.shrinking, 549 | 'tol' : self.tol} 550 | 551 | 552 | def set_params(self, **parameters): 553 | """Param setter""" 554 | 555 | for parameter, value in parameters.items(): 556 | if parameter in ('k_bucket', 'method', 557 | 'C', 'epsilon', 'shrinking', 'tol'): 558 | setattr(self, parameter, value) 559 | 560 | return self 561 | 562 | 563 | class OptimalKernelSVC(BaseOptimalKernelMachine, SVC): 564 | """ 565 | An estimator to learn the optimal kernel for a given sample and 566 | build a support vector classifier based on this custom kernel. 567 | 568 | This class is wrapped around the sklearn SVC estimator to function as its 569 | drop-in replacement, whose implementation is in turn based on LIBSVM. 570 | 571 | Parameters 572 | ---------- 573 | 574 | k_bucket : KernelBucket or str 575 | An instance of KernelBucket that contains all the kernels to be compared, 576 | or a string identifying the sampling_strategy which populates a KernelBucket. 
577 | 578 | method : str 579 | Scoring method to rank different kernels 580 | 581 | C : float, optional (default=1.0) 582 | Penalty parameter C of the error term. 583 | 584 | tol : float, optional (default=1e-3) 585 | Tolerance for stopping criterion. 586 | 587 | shrinking : boolean, optional (default=True) 588 | Whether to use the shrinking heuristic. 589 | 590 | 591 | Attributes 592 | ---------- 593 | support_ : array-like, shape = [n_SV] 594 | Indices of support vectors. 595 | 596 | support_vectors_ : array-like, shape = [nSV, n_features] 597 | Support vectors. 598 | 599 | dual_coef_ : array, shape = [1, n_SV] 600 | Coefficients of the support vector in the decision function. 601 | 602 | coef_ : array, shape = [1, n_features] 603 | Weights assigned to the features (coefficients in the primal 604 | problem). This is only available in the case of a linear kernel. 605 | 606 | `coef_` is readonly property derived from `dual_coef_` and 607 | `support_vectors_`. 608 | 609 | intercept_ : array, shape = [1] 610 | Constants in decision function. 611 | 612 | """ 613 | 614 | 615 | def __init__(self, k_bucket='exhaustive', 616 | method='cv_risk', 617 | C=1.0, 618 | shrinking=True, 619 | tol=1e-3): 620 | """ 621 | SVC classifier trained with the sample-wise optimal kernel 622 | 623 | Parameters 624 | ---------- 625 | k_bucket : KernelBucket or str 626 | An instance of KernelBucket that contains all the kernels to be compared, 627 | or a string identifying sampling strategy to populate a KernelBucket. 628 | 629 | method : str 630 | Scoring method to rank different kernels 631 | 632 | C : float, optional (default=1.0) 633 | Penalty parameter C of the error term. 634 | 635 | shrinking : boolean, optional (default=True) 636 | Whether to use the shrinking heuristic. 637 | 638 | tol : float, optional (default=1e-3) 639 | Tolerance for stopping criterion. 
640 | 641 | """ 642 | 643 | # not init'ing SVC/SVR with kernel='precomputed' to avoid issues with 644 | # cross_val_score and safe_split 645 | super().__init__(C=C, shrinking=shrinking, tol=tol) 646 | 647 | self.k_bucket = k_bucket 648 | self.method = method 649 | self.C = C 650 | self.shrinking = shrinking 651 | self.tol = tol 652 | 653 | 654 | def _find_optimal_kernel(self): 655 | """Method to find the optimal kernel""" 656 | 657 | self._opt_ker_search_est_name = 'SVC' 658 | 659 | return find_optimal_kernel(self._k_bucket, 660 | self._train_X, self._train_y, 661 | method=self.method, 662 | estimator_name=self._opt_ker_search_est_name) 663 | 664 | 665 | def predict(self, X): 666 | """ 667 | Perform classification on samples in X. 668 | 669 | For an one-class model, +1 or -1 is returned. 670 | 671 | Parameters 672 | ---------- 673 | X : {array-like, sparse matrix}, shape (n_samples, n_features) 674 | 675 | Returns 676 | ------- 677 | y_pred : array, shape (n_samples,) 678 | Class labels for samples in X. 
class MatchCountKernel(BaseKernelFunction):
    """
    Categorical kernel measuring similarity via the number of matching categorical
    dimensions.

    Parameters
    ----------

    return_perc : bool
        If True, the return value would be normalized by the number of dimensions.

    skip_input_checks : bool
        Flag to skip input validation to save time. When set, the inputs are
        used as given (no dimension, size or dtype checks), so the caller is
        responsible for passing equal-sized 1D categorical arrays.

    References
    ----------

    Villegas García, Marco A., "An investigation into new kernels for categorical
    variables." Master's thesis, Universitat Politècnica de Catalunya, 2013.

    """

    def __init__(self,
                 return_perc=True,
                 skip_input_checks=False):
        """Constructor."""

        self.return_perc = return_perc
        # the kernel name reflects whether counts or fractions are returned
        if self.return_perc:
            super().__init__('MatchPerc')
        else:
            super().__init__('MatchCount')

        self.skip_input_checks = skip_input_checks

    def __call__(self, vec_c, vec_d):
        """
        Actual implementation of the kernel func.

        Parameters
        ----------

        vec_c, vec_d : array of equal-sized categorical variables

        Returns
        -------
        Number of positions where the two arrays agree, divided by
        ``len(vec_d)`` when ``return_perc`` is True.

        Raises
        ------
        TypeError
            If validation is enabled and either input is not of the
            categorical dtype configured in ``cfg.dtype_categorical``.

        """

        # Honour the flag accepted by the constructor, consistent with the
        # numeric kernels, which validate only when checks are not skipped.
        if not self.skip_input_checks:
            vec_c, vec_d = _check_categorical_arrays(vec_c, vec_d)

            if not np.issubdtype(vec_c.dtype, cfg.dtype_categorical) or \
                not np.issubdtype(vec_d.dtype, cfg.dtype_categorical):
                raise TypeError('Categorical kernels require str or unicode dtype')

        # asarray is a no-op for ndarrays and keeps the comparison elementwise
        # if unvalidated sequences are passed with checks skipped
        match_count = np.sum(np.asarray(vec_c) == np.asarray(vec_d))

        if self.return_perc:
            return match_count / len(vec_d)
        else:
            return match_count

    def __str__(self):
        """human readable repr"""

        return self.name
' 115 | 'They must be of same length'.format(x.size, y.size)) 116 | 117 | return x, y 118 | 119 | 120 | def _ensure_type_size(array, ensure_num_dim=1): 121 | """Checking type and size of arrays""" 122 | 123 | if not isinstance(array, np.ndarray): 124 | array = np.squeeze(np.asarray(array)) 125 | 126 | if array.ndim != ensure_num_dim: 127 | raise ValueError('array must be {}-dimensional! ' 128 | 'It has {} dims with shape {} ' 129 | ''.format(ensure_num_dim, array.ndim, array.shape)) 130 | 131 | return array 132 | -------------------------------------------------------------------------------- /kernelmethods/config.py: -------------------------------------------------------------------------------- 1 | from operator import add, mul 2 | import numpy as np 3 | 4 | class KernelMethodsException(Exception): 5 | """ 6 | Generic exception to indicate invalid use of the ``kernelmethods`` library. 7 | 8 | Allows to distinguish improper use of KernelMatrix from other code exceptions 9 | """ 10 | pass 11 | 12 | 13 | class KMAccessError(KernelMethodsException): 14 | """Exception to indicate invalid access to the kernel matrix elements!""" 15 | pass 16 | 17 | 18 | class KMNormError(KernelMethodsException): 19 | """Custom exception to indicate error during normalization of kernel matrix""" 20 | pass 21 | 22 | 23 | class KMSetAdditionError(KernelMethodsException): 24 | """Exception to indicate invalid addition of kernel matrix to a KernelSet""" 25 | pass 26 | 27 | 28 | class KernelMethodsWarning(Warning): 29 | """Custom warning to indicate kernelmethods-specific warning!""" 30 | pass 31 | 32 | 33 | class Chi2NegativeValuesException(KernelMethodsException): 34 | """Custom exception to indicate Chi^2 kernel requires non-negative values""" 35 | pass 36 | 37 | 38 | VALID_KERNEL_MATRIX_OPS = ('sum', 'product', 'average') 39 | 40 | OPER_KM_OPS = {'sum' : add, 41 | 'product': mul} 42 | 43 | 44 | # default values and ranges 45 | 46 | kernel_bucket_strategies = ('exhaustive', 'light', 
class HadamardKernel(BaseKernelFunction):
    r"""Hadamard kernel function

    Formula::
        K_a(x, y) = \Sum_k {|x_k|^a * |y_k|^a} / {2*(|x_k|^a + |y_k|^a)}

    Alpha (a) must be non-zero.
    Hadamard kernel is not always PSD.

    Parameters
    ----------
    alpha : int
        degree to raise the inner product

    skip_input_checks : bool
        Flag to skip input validation to save time.
        Skipping validation is strongly discouraged for normal use,
        unless you know exactly what you are doing (expert users).

    Raises
    ------
    ValueError
        If Alpha is zero.

    """

    def __init__(self, alpha=3, skip_input_checks=False):
        """
        Constructor

        Parameters
        ----------
        alpha : int
            degree to raise the inner product

        skip_input_checks : bool
            Flag to skip input validation to save time.
            Skipping validation is strongly discouraged for normal use,
            unless you know exactly what you are doing (expert users).

        Raises
        ------
        ValueError
            If alpha is (numerically) zero.

        """

        super().__init__(name='Hadamard')

        if np.isclose(alpha, 0.0):
            raise ValueError('Alpha for Hadamard kernel must be non-zero')
        self.alpha = alpha

        self.skip_input_checks = skip_input_checks

    def __call__(self, x, y):
        """Evaluate the kernel as the sum over dimensions of
        ``|x_k|^a * |y_k|^a / (2 * (|x_k|^a + |y_k|^a))``.

        Dimensions whose denominator is zero contribute 0 to the sum rather
        than a NaN.
        """

        if not self.skip_input_checks:
            x, y = check_input_arrays(x, y, ensure_dtype=np.number)

        abs_x_a = np.power(np.abs(x), self.alpha)
        abs_y_a = np.power(np.abs(y), self.alpha)

        numerator = abs_x_a * abs_y_a
        denominator = 2 * (abs_x_a + abs_y_a)

        # The documented formula is a ratio per dimension; the previous
        # np.dot(numerator, denominator) multiplied the two terms instead.
        # `where` leaves the pre-filled zeros in place wherever the
        # denominator is zero, avoiding 0/0.
        terms = np.divide(numerator, denominator,
                          out=np.zeros_like(denominator, dtype=float),
                          where=denominator != 0)

        return np.sum(terms)

    def __str__(self):
        """human readable repr"""

        return "{}(alpha={})".format(self.name, self.alpha)
class GaussianKernel(BaseKernelFunction):
    """Gaussian kernel function

    Evaluates ``exp(-gamma * ||x - y||^2)`` with
    ``gamma = 1 / (2 * sigma^2)``.

    Parameters
    ----------
    sigma : float
        bandwidth

    skip_input_checks : bool
        Flag to skip input validation to save time.
        Skipping validation is strongly discouraged for normal use,
        unless you know exactly what you are doing (expert users).

    """

    def __init__(self, sigma=2.0, skip_input_checks=False):
        """
        Constructor

        Parameters
        ----------
        sigma : float
            bandwidth

        skip_input_checks : bool
            Flag to skip input validation to save time.
            Skipping validation is strongly discouraged for normal use,
            unless you know exactly what you are doing (expert users).

        """

        super().__init__(name='gaussian')

        # TODO implement param check
        # sigma and gamma are both passed through _ensure_min_eps to avoid
        # zero division
        bandwidth = _ensure_min_eps(sigma)
        self.sigma = bandwidth
        self.gamma = _ensure_min_eps(1.0 / (2 * bandwidth ** 2))

        self.skip_input_checks = skip_input_checks

    def __call__(self, x, y):
        """Evaluate the kernel for the pair of samples ``x`` and ``y``."""

        validate = not self.skip_input_checks
        if validate:
            x, y = check_input_arrays(x, y, ensure_dtype=np.number)

        distance = np.linalg.norm(x - y, ord=2)
        return np.exp(-self.gamma * distance ** 2)

    def __str__(self):
        """human readable repr"""

        return "{}(sigma={})".format(self.name, self.sigma)
class Chi2Kernel(BaseKernelFunction):
    """Chi-squared kernel function

    This kernel is implemented as::

        k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])

    x and y must have non-negative values (>=0).

    As a division is involved, when x+y is 0 for a particular dimension (for
    non-negative inputs this means x and y are both 0 there), the division
    results in a NaN, which is currently being ignored by summing only the
    non-NaN values. If your feature sets have many zeros, you may want to
    investigate the effect of this kernel on your dataset carefully to ensure
    it meets your needs and expectations.

    Parameters
    ----------
    gamma : float
        scale factor

    skip_input_checks : bool
        Flag to skip input validation to save time.
        Skipping validation is strongly discouraged for normal use,
        unless you know exactly what you are doing (expert users).

    """

    def __init__(self, gamma=1.0, skip_input_checks=False):
        """
        Constructor

        Parameters
        ----------
        gamma : float
            scale factor

        skip_input_checks : bool
            Flag to skip input validation to save time.
            Skipping validation is strongly discouraged for normal use,
            unless you know exactly what you are doing (expert users).

        """

        super().__init__(name='chi2')

        self.gamma = gamma

        self.skip_input_checks = skip_input_checks

    def __call__(self, x, y):
        """Evaluate the Chi^2 kernel for the pair (x, y).

        Raises
        ------
        Chi2NegativeValuesException
            If x or y contains a negative value.
        """

        if not self.skip_input_checks:
            x, y = check_input_arrays(x, y, ensure_dtype=np.float64)

        if (x < 0).any() or (y < 0).any():
            raise Chi2NegativeValuesException(
                'Chi^2 kernel requires non-negative values!'
                ' x or y contains negative values')

        # Divisions by zero are expected here and deliberately dropped by
        # np.nansum below, so the corresponding RuntimeWarnings are only noise.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratios = np.power(x - y, 2) / (x + y)

        # Note: NaNs due to zero division are being ignored via np.nansum!
        return np.exp(-self.gamma * np.nansum(ratios))

    def __str__(self):
        """human readable repr"""

        return "{}(gamma={})".format(self.name, self.gamma)
376 | 377 | """ 378 | 379 | super().__init__(name='sigmoid') 380 | 381 | self.gamma = gamma 382 | self.offset = offset 383 | 384 | self.skip_input_checks = skip_input_checks 385 | 386 | 387 | def __call__(self, x, y): 388 | """Actual implementation of kernel func""" 389 | 390 | if not self.skip_input_checks: 391 | x, y = check_input_arrays(x, y, ensure_dtype=np.number) 392 | 393 | return np.tanh(self.offset + (self.gamma * np.dot(x, y))) 394 | 395 | 396 | def __str__(self): 397 | """human readable repr""" 398 | 399 | return "{}(gamma={},offset={})".format(self.name, self.gamma, self.offset) 400 | 401 | 402 | class LinearKernel(BaseKernelFunction): 403 | """Linear kernel function 404 | 405 | Parameters 406 | ---------- 407 | skip_input_checks : bool 408 | Flag to skip input validation to save time. 409 | Skipping validation is strongly discouraged for normal use, 410 | unless you know exactly what you are doing (expert users). 411 | """ 412 | 413 | 414 | def __init__(self, skip_input_checks=False): 415 | """ 416 | Constructor 417 | 418 | Parameters 419 | ---------- 420 | skip_input_checks : bool 421 | Flag to skip input validation to save time. 422 | Skipping validation is strongly discouraged for normal use, 423 | unless you know exactly what you are doing (expert users). 
def is_positive_semidefinite(sym_matrix,
                             tolerance=1e-6,
                             verbose=False):
    """
    Tests whether a given matrix is positive-semidefinite (PSD).

    A symmetric matrix is PSD if ALL its eigen values >= 0 (non-negative).
    If any of its eigen values are negative, it is not PSD.

    This function accounts for numerical instabilities with a tolerance
    parameter: eigen values down to ``-tolerance`` are still accepted.

    This function can also be called with a shorthand ``is_PSD()``

    Parameters
    ----------
    sym_matrix : ndarray
        Matrix to be evaluated for PSDness

    tolerance : float
        Tolerance parameter to account for numerical instabilities in the eigen
        value computations (which can result in negative eigen values very
        slightly below 0)

    verbose : bool
        Flag to indicate whether to print traceback in case of errors
        during the computation of the eigen values

    Returns
    -------
    psd : bool
        Flag indicating whether the matrix is PSD.

    Raises
    ------
    TypeError
        If the input is not a numpy array.

    """

    if not isinstance(sym_matrix, np.ndarray):
        raise TypeError('Input matrix must be in numpy array format!')

    # anything that is not a 2D square array cannot be PSD; checking ndim first
    # avoids an IndexError on shape[1] for 0-D/1-D inputs
    if sym_matrix.ndim != 2 or sym_matrix.shape[0] != sym_matrix.shape[1]:
        warn('Input matrix is not square, and hence not PSD')
        return False

    if not np.isclose(sym_matrix, sym_matrix.T).all():
        warn('Input matrix is not symmetric, and hence not PSD')
        return False

    try:
        eig_values = eigh(sym_matrix, eigvals_only=True)
    except LinAlgError:
        if verbose:
            traceback.print_exc()
        # we are not actually raising LinAlgError, just using it to categorize
        # the matrix as not PSD
        print('LinAlgError raised - eigen value computation failed --> not PSD')
        return False
    except Exception:
        # deliberately best-effort, but KeyboardInterrupt / SystemExit must
        # still propagate, hence Exception rather than a bare except
        if verbose:
            traceback.print_exc()
        warn('Unknown exception during eigen value computation --> not PSD')
        return False

    if verbose:
        print('Smallest eigen values are:\n'
              '{}'.format(eig_values[:10]))

    # notice the negative sign before tolerance
    return not bool(np.any(eig_values < -tolerance))


# shorter alias
is_PSD = is_positive_semidefinite
def normalize_km_2sample(cross_K_XY, diag_K_XX, diag_K_YY, method='cosine'):
    """
    Normalize a two-sample kernel matrix K(X,Y).

    Cosine normalization divides each entry k(x_i, y_j) by
    sqrt(k(x_i, x_i)) * sqrt(k(y_j, y_j)).
    Implements definition _similar_ to Section 5.1 in book (Page 113)
    Shawe-Taylor and Cristianini, "Kernels Methods for Pattern Analysis", 2004

    Parameters
    ----------
    cross_K_XY : ndarray, 2D
        Matrix of inner-products for samples from X onto Y i.e. K(X,Y)

    diag_K_XX : array
        Diagonal from matrix of inner-products for samples from X onto itself
        i.e. K(X,X)
        K(X,X) must NOT be normalized (otherwise they will all be 1s)

    diag_K_YY : array
        Diagonal from matrix of inner-products for samples from Y onto itself
        i.e. K(Y,Y)

    method : str
        Method of normalization. Options: ``cosine`` only.

    Returns
    -------
    normed_km : ndarray
        Normalized version of K(X,Y)

    Raises
    ------
    ValueError
        If the lengths of the diagonals do not match the shape of K(X,Y).
    NotImplementedError
        If ``method`` is anything other than ``cosine``.
    KMNormError
        If any diagonal entry is [close to] zero.

    NOTE: K_XY may NOT have unit diagonal, as k(x,y) != sqrt(k(x,x))*sqrt(k(y,y))
    """

    # flattened views: the diagonals may arrive as lists, row or column vectors
    diag_xx = np.ravel(diag_K_XX)
    diag_yy = np.ravel(diag_K_YY)

    if diag_xx.size != cross_K_XY.shape[0] or \
            cross_K_XY.shape[1] != diag_yy.size:
        raise ValueError('Shape mismatch for multiplication across the 3 kernel '
                         'matrices! Length of diag_K_XX must match '
                         'number of rows in K_XY, and number of columns in K_XY '
                         'must match length of diag_K_YY.')

    method = method.lower()
    if method != 'cosine':
        raise NotImplementedError('Two-sample normalization method {} is not '
                                  'implemented yet!'.format(method))

    if np.isclose(diag_xx, 0.0).any() or np.isclose(diag_yy, 0.0).any():
        raise KMNormError(
                'Some diagnoal entries in one of the KMs are [close to] zero - '
                ' this results in infinite or Nan values '
                'during Cosine normalization of KM!')

    # Equivalent to diag(1/sqrt(kxx)) @ K_XY @ diag(1/sqrt(kyy)), but scaling
    # rows and columns by broadcasting avoids building two dense diagonal
    # matrices and the two full matrix products.
    row_scale = 1 / np.sqrt(diag_xx)
    col_scale = 1 / np.sqrt(diag_yy)

    return (row_scale[:, np.newaxis] * cross_K_XY) * col_scale[np.newaxis, :]
298 | 299 | Parameters 300 | ---------- 301 | A : ndarray 302 | Matrix to compute the norm of 303 | 304 | Returns 305 | ------- 306 | norm : float 307 | Frobenious norm 308 | 309 | """ 310 | 311 | return np.sqrt(frobenius_product(A, A)) 312 | 313 | 314 | def alignment_centered(km_one, km_two, 315 | value_if_zero_division='raise', 316 | centered_already=False): 317 | """ 318 | Computes the centered alignment between two kernel matrices 319 | 320 | (Alignment is computed on centered kernel matrices) 321 | 322 | Implements Definition 4 (Kernel matrix alignment) from Section 2.3 in Cortes, 323 | Corinna, Mehryar Mohri, and Afshin Rostamizadeh, 2012, "Algorithms for 324 | Learning Kernels Based on Centered Alignment", Journal of Machine Learning 325 | Research 13(Mar): 795–828. 326 | 327 | Parameters 328 | ---------- 329 | 330 | km_one, km_two : KernelMatrix 331 | 332 | value_if_zero_division : str or float 333 | determines the value of alignment, in case the norm of one of the two 334 | kernel matrices is close to zero and we are unable to compute it. 335 | 336 | Default is 'raise', requesting to raise an exception. 337 | 338 | One could also choose 0.0, which assigns lowest alignment, effectively 339 | discarding it for ranking purposes. 340 | 341 | centered_already : bool 342 | Flag to indicate whether the input kernel matrices are centered already 343 | or not. If False, input KMs will be centered. 344 | 345 | Returns 346 | ------- 347 | centered_alignment : float 348 | Value of centered_alignment between the two kernel matrices 349 | 350 | """ 351 | 352 | if km_one.shape != km_two.shape: 353 | raise ValueError('Dimensions of the two matrices must be the same ' 354 | 'to compute their alignment! 
def linear_combination(km_set, weights, norm_weights=False):
    """
    Weighted linear combinations of a set of given kernel matrices

    Parameters
    ----------
    km_set : KernelSet
        Collection of compatible kernel matrices

    weights : Iterable
        Set of weights for the kernel matrices in km_set.
        Weights are not checked to sum to 1.0. Use norm_weights=True if needed.

    norm_weights : bool
        Flag to request normalizing weights to ensure they sum to 1.0

    Returns
    -------
    lin_comb_KM : ndarray
        Final result of weighted linear combination of the kernel matrix set

    Raises
    ------
    ValueError
        If the number of weights differs from the size of the kernel set.
    RuntimeError
        If normalization is requested but the weights sum to [nearly] zero.

    """

    num_weights = len(weights)
    if km_set.size != num_weights:
        # report the values in the order the message names them
        raise ValueError('Number of weights ({}) supplied differ '
                         'from the kernel set size ({})'
                         ''.format(num_weights, km_set.size))

    weights = ensure_ndarray_1D(weights)

    if norm_weights:
        denom = weights.sum()
        if np.isclose(denom, 0.0):
            raise RuntimeError('sum of weights == 0.0, unable to normalize!')
        weights = weights / denom

    # Computes the weighted average kernel
    # km_set.num_samples is a tuple (N, M) when operating on two samples
    # e.g. train x test
    KM = np.zeros(km_set.num_samples)
    for weight, km in zip(weights, km_set):
        KM = KM + weight * km.full

    return KM
def rank_kernels(kernel_bucket, targets, method='align/corr', **method_params):
    """
    Scores every kernel matrix in the bucket with the requested ranking metric.

    Choices for the method include: "align/corr", "cv_risk"

    Parameters
    ----------
    kernel_bucket : KernelBucket
        Bucket of kernel matrices to be scored.

    targets : Iterable
        target values of the sample attached to the bucket

    method : str
        Identifies one of the metrics: ``align/corr``, ``cv_risk``
        (``cv`` is accepted as a synonym for ``cv_risk``). Case-insensitive.

    method_params : dict
        Additional parameters to be passed on to the method chosen above.

    Returns
    -------
    scores : ndarray
        Values of the ranking metrics computed for the kernel matrices in the
        bucket

    Raises
    ------
    NotImplementedError
        If ``method`` is not listed in ``cfg.VALID_RANKING_METHODS``.

    """

    chosen = method.lower()
    if chosen not in cfg.VALID_RANKING_METHODS:
        raise NotImplementedError('Ranking method not recognized. Choose one of {}'
                                  ''.format(cfg.VALID_RANKING_METHODS))

    if chosen == "align/corr":
        ranker = alignment_ranking
    elif chosen in ('cv_risk', 'cv'):
        ranker = CV_ranking
    else:
        # valid according to the config, but no implementation is wired up here
        return None

    return ranker(kernel_bucket, targets, **method_params)
155 | Options: ``SVM`` and ``SVR``. 156 | Default: ``SVM`` 157 | 158 | Returns 159 | ------- 160 | base_learner : Estimator 161 | An sklearn estimator 162 | 163 | param_grid : dict 164 | Parameter grid (sklearn format) for the chosen estimator. 165 | 166 | """ 167 | 168 | # TODO hyper-param optimization needs to be incorporated somewhere!! 169 | # Perhaps by returning a GridSearchCV(base_learner) object or similar? 170 | 171 | learner_id = learner_id.lower() 172 | if learner_id in ('svm', 'svc'): 173 | from sklearn.svm import SVC 174 | range_C = np.power(10.0, range(-6, 6)) 175 | param_grid = dict(C=range_C) 176 | base_learner = SVC(kernel='precomputed', probability=True, C=10) 177 | elif learner_id in ('svr',): 178 | from sklearn.svm import SVR 179 | range_C = np.power(10.0, range(-6, 6)) 180 | param_grid = dict(C=range_C) 181 | base_learner = SVR(kernel='precomputed', C=10) 182 | else: 183 | raise NotImplementedError('Requested base learner {} is not implemented yet!' 184 | ''.format(learner_id)) 185 | 186 | return base_learner, param_grid 187 | -------------------------------------------------------------------------------- /kernelmethods/sampling.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from warnings import warn 3 | 4 | import numpy as np 5 | from kernelmethods import config as cfg 6 | from kernelmethods.base import BaseKernelFunction, KernelMatrix, KernelSet 7 | from kernelmethods.config import KernelMethodsException, KernelMethodsWarning 8 | from kernelmethods.numeric_kernels import (GaussianKernel, LaplacianKernel, 9 | LinearKernel, PolyKernel, SigmoidKernel) 10 | from kernelmethods.operations import alignment_centered 11 | from kernelmethods.utils import is_iterable_but_not_str 12 | from scipy.stats.stats import pearsonr 13 | 14 | 15 | class KernelBucket(KernelSet): 16 | """ 17 | Class to generate and/or maintain a "bucket" of candidate kernels. 
18 | 19 | Applications: 20 | 21 | 1. to rank/filter/select kernels based on a given sample via many metrics 22 | 2. to be defined. 23 | 24 | **Note**: 25 | 1. Linear kernel is always added during init without your choosing. 26 | 2. This is in contrast to Chi^2 kernel, which is not added to the bucket by 27 | default, as it requires positive feature values and may break default use for 28 | common applications. You can easily add Chi^2 or any other kernels via the 29 | ``add_parametrized_kernels`` method. 30 | 31 | 32 | Parameters 33 | ---------- 34 | poly_degree_values : Iterable 35 | List of values for the degree parameter of the PolyKernel. One 36 | KernelMatrix will be added to the bucket for each value. 37 | 38 | rbf_sigma_values : Iterable 39 | List of values for the sigma parameter of the GaussianKernel. One 40 | KernelMatrix will be added to the bucket for each value. 41 | 42 | laplace_gamma_values : Iterable 43 | List of values for the gamma parameter of the LaplacianKernel. One 44 | KernelMatrix will be added to the bucket for each value. 45 | 46 | sigmoid_gamma_values : Iterable 47 | List of values for the gamma parameter of the SigmoidKernel. One 48 | KernelMatrix will be added to the bucket for each value. 49 | 50 | sigmoid_offset_values : Iterable 51 | List of values for the offset parameter of the SigmoidKernel. One 52 | KernelMatrix will be added to the bucket for each value. 53 | 54 | name : str 55 | String to identify the purpose or type of the bucket of kernels. 56 | Also helps easily distinguishing it from other buckets. 57 | 58 | normalize_kernels : bool 59 | Flag to indicate whether the kernel matrices need to be normalized 60 | 61 | skip_input_checks : bool 62 | Flag to indicate whether checks on input data (type, format etc) can 63 | be skipped. This helps save a tiny bit of runtime for expert uses when 64 | data types and formats are managed thoroughly in numpy. Default: 65 | False. Disable this only when you know exactly what you're doing! 
66 | 67 | """ 68 | 69 | 70 | def __init__(self, 71 | poly_degree_values=cfg.default_degree_values_poly_kernel, 72 | rbf_sigma_values=cfg.default_sigma_values_gaussian_kernel, 73 | laplace_gamma_values=cfg.default_gamma_values_laplacian_kernel, 74 | sigmoid_gamma_values=cfg.default_gamma_values_sigmoid_kernel, 75 | sigmoid_offset_values=cfg.default_offset_values_sigmoid_kernel, 76 | name='KernelBucket', 77 | normalize_kernels=True, 78 | skip_input_checks=False, 79 | ): 80 | """ 81 | Constructor. 82 | 83 | Parameters 84 | ---------- 85 | poly_degree_values : Iterable 86 | List of values for the degree parameter of the PolyKernel. One 87 | KernelMatrix will be added to the bucket for each value. 88 | 89 | rbf_sigma_values : Iterable 90 | List of values for the sigma parameter of the GaussianKernel. One 91 | KernelMatrix will be added to the bucket for each value. 92 | 93 | laplace_gamma_values : Iterable 94 | List of values for the gamma parameter of the LaplacianKernel. One 95 | KernelMatrix will be added to the bucket for each value. 96 | 97 | sigmoid_gamma_values : Iterable 98 | List of values for the gamma parameter of the SigmoidKernel. One 99 | KernelMatrix will be added to the bucket for each value. 100 | 101 | sigmoid_offset_values : Iterable 102 | List of values for the offset parameter of the SigmoidKernel. One 103 | KernelMatrix will be added to the bucket for each value. 104 | 105 | name : str 106 | String to identify the purpose or type of the bucket of kernels. 107 | Also helps easily distinguishing it from other buckets. 108 | 109 | normalize_kernels : bool 110 | Flag to indicate whether the kernel matrices need to be normalized 111 | 112 | skip_input_checks : bool 113 | Flag to indicate whether checks on input data (type, format etc) can 114 | be skipped. This helps save a tiny bit of runtime for expert uses when 115 | data types and formats are managed thoroughly in numpy. Default: 116 | False. 
Disable this only when you know exactly what you're doing! 117 | 118 | """ 119 | 120 | if isinstance(normalize_kernels, bool): 121 | self._norm_kernels = normalize_kernels 122 | else: 123 | raise TypeError('normalize_kernels must be bool') 124 | 125 | if isinstance(skip_input_checks, bool): 126 | self._skip_input_checks = skip_input_checks 127 | else: 128 | raise TypeError('skip_input_checks must be bool') 129 | 130 | # start with the addition of kernel matrix for linear kernel 131 | init_kset = [KernelMatrix(LinearKernel(), normalized=self._norm_kernels), ] 132 | super().__init__(km_list=init_kset, name=name) 133 | # not attached to a sample yet 134 | self._num_samples = None 135 | 136 | self.add_parametrized_kernels(PolyKernel, 'degree', poly_degree_values) 137 | self.add_parametrized_kernels(GaussianKernel, 'sigma', rbf_sigma_values) 138 | self.add_parametrized_kernels(LaplacianKernel, 'gamma', laplace_gamma_values) 139 | self.add_parametrized_kernels(SigmoidKernel, 'gamma', sigmoid_gamma_values) 140 | self.add_parametrized_kernels(SigmoidKernel, 'offset', sigmoid_offset_values) 141 | 142 | 143 | def add_parametrized_kernels(self, kernel_func, param, values): 144 | """ 145 | Adds a list of kernels parametrized by various values for a given param 146 | 147 | Parameters 148 | ---------- 149 | kernel_func : BaseKernelFunction 150 | Kernel function to be added (not an instance, but callable class) 151 | 152 | param : str 153 | Name of the parameter to the above kernel function 154 | 155 | values : Iterable 156 | List of parameter values. One kernel will be added for each value 157 | 158 | """ 159 | 160 | if (not isinstance(kernel_func, type)) or \ 161 | (not issubclass(kernel_func, BaseKernelFunction)): 162 | raise KernelMethodsException('Input {} is not a valid kernel func!' 163 | ' Must be derived from BaseKernelFunction' 164 | ''.format(kernel_func)) 165 | 166 | if values is None: 167 | # warn('No values provided for {}. 
def make_kernel_bucket(strategy='exhaustive',
                       normalize_kernels=True,
                       skip_input_checks=False):
    """
    Generates a bucket of candidate kernels based on user preferences.

    Parameters
    ----------
    strategy : str
        Name of the strategy for populating the kernel bucket (case-insensitive).
        Options: 'exhaustive', 'light' and 'linear_only' (only the linear
        kernel, which every bucket contains). Default: 'exhaustive'.
        If a ``KernelBucket`` or ``KernelSet`` is passed instead, it is returned
        unchanged after issuing a warning.

    normalize_kernels : bool
        Flag to indicate whether to normalize the kernel matrices

    skip_input_checks : bool
        Flag to indicate whether checks on input data (type, format etc) can
        be skipped. This helps save a tiny bit of runtime for expert uses when
        data types and formats are managed thoroughly in numpy. Default:
        False. Disable this only when you know exactly what you're doing!

    Returns
    -------
    kb : KernelBucket
        Kernel bucket populated according to the requested strategy

    Raises
    ------
    ValueError
        If ``strategy`` is neither a string naming a known strategy nor an
        existing kernel bucket/set.

    """

    if isinstance(strategy, (KernelBucket, KernelSet)):
        import warnings
        warnings.warn('Input is already a kernel bucket/set - simply returning it!')
        return strategy

    # anything without .lower() used to escape as AttributeError; report it the
    # same way as an unknown strategy name
    if not isinstance(strategy, str):
        raise ValueError('Invalid choice of strategy '
                         '- must be one of {}'.format(cfg.kernel_bucket_strategies))

    strategy = strategy.lower()
    if strategy == 'exhaustive':
        return KernelBucket(name='KBucketExhaustive',
                            normalize_kernels=normalize_kernels,
                            skip_input_checks=skip_input_checks,
                            poly_degree_values=cfg.default_degree_values_poly_kernel,
                            rbf_sigma_values=cfg.default_sigma_values_gaussian_kernel,
                            laplace_gamma_values=cfg.default_gamma_values_laplacian_kernel,
                            sigmoid_gamma_values=cfg.default_gamma_values_sigmoid_kernel,
                            sigmoid_offset_values=cfg.default_offset_values_sigmoid_kernel)
    elif strategy == 'light':
        return KernelBucket(name='KBucketLight',
                            normalize_kernels=normalize_kernels,
                            skip_input_checks=skip_input_checks,
                            poly_degree_values=cfg.light_degree_values_poly_kernel,
                            rbf_sigma_values=cfg.light_sigma_values_gaussian_kernel,
                            laplace_gamma_values=cfg.light_gamma_values_laplacian_kernel,
                            sigmoid_gamma_values=cfg.light_gamma_values_sigmoid_kernel,
                            sigmoid_offset_values=cfg.light_offset_values_sigmoid_kernel)
    elif strategy == 'linear_only':
        # None for every parameter list => only the always-present linear kernel.
        # Named distinctly so it cannot be confused with the 'light' bucket.
        return KernelBucket(name='KBucketLinearOnly',
                            normalize_kernels=normalize_kernels,
                            skip_input_checks=skip_input_checks,
                            poly_degree_values=None,
                            rbf_sigma_values=None,
                            laplace_gamma_values=None,
                            sigmoid_gamma_values=None,
                            sigmoid_offset_values=None)
    else:
        raise ValueError('Invalid choice of strategy '
                         '- must be one of {}'.format(cfg.kernel_bucket_strategies))
def correlation_km(k1, k2):
    """
    Pearson correlation coefficient between two kernel matrices.

    Both matrices are flattened to 1-D vectors, and the correlation is computed
    between those vectors; the accompanying p-value is discarded.

    Parameters
    ----------
    k1, k2 : ndarray
        Two kernel matrices of the same size

    Returns
    -------
    corr_coef : float
        Correlation coefficient between the vectorized kernel matrices

    """

    flat_one = k1.ravel()
    flat_two = k2.ravel()
    corr_coef, _ = pearsonr(flat_one, flat_two)

    return corr_coef
def gen_random_sample(num_samples, sample_dim):
    """Return a (num_samples, sample_dim) array of uniform random floats in [0, 1)."""

    # TODO input sparse arrays for test
    shape = (num_samples, sample_dim)
    return np.random.rand(*shape)
def test_optimal_kernel_estimators():
    """OptimalKernel estimators must instantiate, pass the tolerant estimator
    checks, and reject invalid ``k_bucket`` values with ValueError."""

    train_data, labels = make_classification(n_features=sample_dim, n_classes=2,
                                             n_samples=n_training)

    # creating the smallest bucket, just with linear kernel, to speed up tests
    kb = make_kernel_bucket(strategy='linear_only')

    for OKEstimator in (OptimalKernelSVC, OptimalKernelSVR,):

        try:
            ok_est = OKEstimator(k_bucket=kb)
        except Exception as exc:
            # name the estimator that actually failed and keep the root cause
            raise RuntimeError('Unable to instantiate {}!'
                               ''.format(OKEstimator.__name__)) from exc

        # runs the sklearn estimator checks, tolerating their known failures
        _test_estimator_can_fit_predict(ok_est)

        # an int, a float and a string that is not a valid strategy name
        for invalid_value in (np.random.randint(10), 10.1, 'tuple'):
            with raises(ValueError):
                ok_est = OKEstimator(k_bucket=invalid_value)
                ok_est.fit(train_data, labels)

        ok_est = OKEstimator(k_bucket=kb)
        ok_est.set_params(k_bucket=kb)
DEFINED_KERNEL_FUNCS: 124 | for ker_machine in (KernelMachine, KernelMachineRegressor): 125 | # print('\n\nTesting {}'.format(kernel)) 126 | try: 127 | k_machine = ker_machine(ker_func) 128 | except: 129 | raise RuntimeError('Unable to instantiate KernelMachine ' 130 | 'with this this ker func {}!'.format(ker_func)) 131 | 132 | # print('\n{}'.format(k_machine)) 133 | try: 134 | _test_estimator_can_fit_predict( 135 | k_machine, 'kernel machine with ' + str(ker_func)) 136 | except Exception as exc: 137 | raise 138 | 139 | -------------------------------------------------------------------------------- /kernelmethods/tests/test_base_classes.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from pytest import raises 4 | 5 | from kernelmethods.base import (AverageKernel, BaseKernelFunction, CompositeKernel, 6 | KernelFromCallable, KernelMatrix, 7 | KernelMatrixPrecomputed, ProductKernel, 8 | SumKernel, WeightedAverageKernel) 9 | from kernelmethods.config import KMAccessError 10 | from kernelmethods.numeric_kernels import (GaussianKernel, LaplacianKernel, 11 | LinearKernel, PolyKernel) 12 | from kernelmethods.sampling import make_kernel_bucket 13 | from kernelmethods.tests.test_numeric_kernels import _test_for_all_kernels 14 | 15 | default_feature_dim = 10 16 | range_feature_dim = [10, 500] 17 | range_num_samples = [50, 500] 18 | num_samples = np.random.randint(20) 19 | sample_dim = np.random.randint(10) 20 | range_polynomial_degree = [2, 10] # degree=1 is tested in LinearKernel() 21 | 22 | np.random.seed(42) 23 | 24 | # choosing skip_input_checks=False will speed up test runs 25 | # default values for parameters 26 | SupportedKernels = (GaussianKernel(), PolyKernel(), LinearKernel(), 27 | LaplacianKernel()) 28 | num_tests_psd_kernel = 3 29 | 30 | def gen_random_array(dim): 31 | """To better control precision and type of floats""" 32 | 33 | # TODO input sparse arrays for test 34 | return 
def test_KernelMatrix_design():
    """Constructor type checks and basic indexing behaviour of KernelMatrix."""

    # a plain callable is not a valid kernel function
    with raises(TypeError):
        _ = KernelMatrix(kernel=simple_callable)

    # a valid kernel *instance* is passed, so that the TypeError can only come
    # from the non-bool value of normalized
    with raises(TypeError):
        _ = KernelMatrix(kernel=LinearKernel(), normalized='True')

    assert len(km_lin) == num_samples**2

    colon_access = km_lin[:, :]
    if colon_access.size != km_lin.size:
        raise ValueError('error in getitem implementation when using [:, :]')

    _ = km_lin[1, :]
    _ = km_lin[:, 1]
    # np.inf / np.nan: the capitalised aliases np.Inf / np.NaN no longer exist
    # in NumPy 2.0
    for invalid_index in (-1, np.inf, np.nan):
        with raises(KMAccessError):
            _ = km_lin[:, invalid_index]
def test_composite_kernels():
    """Each composite kernel must derive from CompositeKernel, be fit-able on a
    bucket attached to a sample, and expose the required attributes."""

    bucket = make_kernel_bucket()
    bucket.attach_to(gen_random_sample(num_samples, sample_dim))

    required = ('composite_KM', 'full')

    for composite_cls in (AverageKernel, SumKernel, WeightedAverageKernel,
                          ProductKernel):

        # only the weighted average takes a weight per kernel matrix
        if issubclass(composite_cls, WeightedAverageKernel):
            extra_args = (np.random.rand(bucket.size),)
        else:
            extra_args = ()
        combined = composite_cls(bucket, *extra_args)

        if not isinstance(combined, CompositeKernel):
            raise TypeError(' Composite kernel {} not defined properly: '
                            'it must be a child of {}'
                            ''.format(combined, CompositeKernel))

        combined.fit()

        missing = next((attr for attr in required
                        if not hasattr(combined, attr)), None)
        if missing is not None:
            raise TypeError('{} does not have attr {}'.format(combined, missing))
def random_string(length=5):
    """Return a string of ``length`` randomly chosen ASCII letters."""

    letters = random.choices(string.ascii_letters, k=length)
    return ''.join(letters)
def test_kernel_design():
    """
    Design contract for every supported categorical kernel:

    1. it is callable with (at least) two samples
    2. it has a ``name`` attribute
    3. it raises TypeError when given non-categorical input

    """

    # inputs of boolean, float and object type - none of them categorical
    non_categorical_inputs = [(True, False, True),
                              [1.0, 2.4],
                              [object, object]]

    for kernel in SupportedKernels:

        # must be callable with 2 args
        check_callable(kernel, min_num_args=2)

        has_name = hasattr(kernel, 'name')
        if not has_name:
            raise TypeError('{} does not have name attribute!'.format(kernel))

        # the same invalid input is used for both arguments
        for bad_input in non_categorical_inputs:
            with raises(TypeError):
                _ = kernel(bad_input, bad_input)
# list(HealthCheck) replaces the deprecated HealthCheck.all(); HealthCheck is an
# enum, so iterating it yields every health check
@hyp_settings(max_examples=num_tests_psd_kernel, deadline=None,
              suppress_health_check=list(HealthCheck))
@given(strategies.integers(range_feature_dim[0], range_feature_dim[1]),
       strategies.integers(range_num_samples[0], range_num_samples[1]),
       strategies.integers(range_string_length[0], range_string_length[1]),
       strategies.booleans())
def test_match_count_kernel(sample_dim, num_samples, string_length, perc_flag):
    """Tests specific to MatchCountKernel, with and without ``return_perc``:
    the common kernel checks, and that its kernel matrix is PSD."""

    match_kernel = MatchCountKernel(return_perc=perc_flag,
                                    skip_input_checks=False)
    _test_for_all_kernels(match_kernel, sample_dim, string_length)
    _test_func_is_valid_kernel(match_kernel, sample_dim, num_samples,
                               string_length)
def test_symmetry():
    """The kernel matrix attached to a single sample must equal its transpose."""

    # raise (instead of print) so that an asymmetric KM actually fails the test
    if not np.isclose(km1.full, km1.full.T).all():
        raise ValueError('KM not symmetric')
def test_size_properties():
    """Diagonal length and total size of the KM must agree with the number of
    samples, both the module-level one and the one stored on the KM."""

    diag_length = len(km1.diagonal())
    if diag_length != num_samples:
        raise ValueError('KM diagonal does not have N elements!')

    if not km1.size == num_samples ** 2:
        raise ValueError('KM size does not match N^2, N=num_samples')

    if not km1.size == km1.num_samples ** 2:
        raise ValueError('KM size does not match N^2, invalid internal representation!')
def test_internal_flags_on_recompute():
    """After a reset, accessing ``.full`` must repopulate the KM and update all
    the internal bookkeeping flags and counters."""

    km1.attach_to(sample_data)  # reset first
    _ = km1.full  # recompute; only the side effects matter here
    if not km1._populated_fully:
        raise ValueError('flag _populated_fully not set to True upon recompute')
    if km1._num_ker_eval != max_num_ker_eval:
        raise ValueError('unexpected value for counter _num_ker_eval upon recompute!')
    if not hasattr(km1, '_full_km'):
        raise ValueError('_full_km is not populated yet!')
    # this checks the dict size against max_num_elements, not for emptiness
    if len(km1._KM) != max_num_elements:
        raise ValueError('internal dict does not have the expected number of '
                         'elements upon recompute!')
    if not km1._lower_tri_km_filled:
        raise ValueError('flag _lower_tri_km_filled not set to True '
                         'upon recompute with fill_lower_tri=True')
it is not necessarily symmetric 187 | 188 | """ 189 | 190 | sample_two = np.random.rand(num_samples_two, sample_two_dim) 191 | targets_two = np.random.choice(target_label_set, num_samples_two) 192 | 193 | for kernel in DEFINED_KERNEL_FUNCS: 194 | km2 = KernelMatrix(kernel=kernel, normalized=False) 195 | km2.attach_to(sample_data, name_one='S1', sample_two=sample_two, name_two='S2') 196 | km2_dense = km2.full # this will force computation of full KM 197 | 198 | rand_ix_one = np.random.choice(range(num_samples), 5) 199 | rand_ix_two = np.random.choice(range(num_samples_two), 5) 200 | for ix_one, ix_two in zip(rand_ix_one, rand_ix_two): 201 | external_eval = kernel(sample_data[ix_one,:], sample_two[ix_two,:]) 202 | if not np.isclose(km2[ix_one, ix_two], external_eval): 203 | raise ValueError('Invalid implementation in two sample case:' 204 | '\n\tcomputed values do not match external evaluation!' 205 | '\n\t for {}'.format(kernel)) 206 | 207 | if km2.size != num_samples*num_samples_two: 208 | raise ValueError('KM size does not match N1*N2, N=num_samples for dataset i') 209 | 210 | if km2.size != np.prod(km2.num_samples): 211 | raise ValueError('KM size does not match N1*N2, invalid internal representation!') 212 | 213 | with raises(NotImplementedError): 214 | km2.center() 215 | 216 | with raises(KMAccessError): 217 | km2.centered 218 | 219 | with raises((KMAccessError, NotImplementedError)): 220 | km2.diagonal() 221 | 222 | with raises(ValueError): 223 | # dimensionalities can not differ! 
def test_constant_km():
    """ConstantKernelMatrix must report consistent sizes, return the constant
    for any valid index, and raise KMAccessError for invalid ones."""

    rand_val = np.random.random()
    rand_size = np.random.randint(50)

    const = ConstantKernelMatrix(num_samples=rand_size,
                                 value=rand_val)
    # trying name param also
    const = ConstantKernelMatrix(num_samples=rand_size,
                                 value=rand_val, name=None)

    assert const.num_samples == rand_size == const.size
    assert len(const) == rand_size
    assert const.shape == (rand_size, rand_size)

    for _ in range(min(5, rand_size)):
        indices = np.random.randint(0, rand_size, 2)
        assert all(const[indices[0], indices[1]] == rand_val)

    # np.inf: the capitalised alias np.Inf no longer exists in NumPy 2.0
    for invalid_index in ('index', ':',
                          [np.inf, ], [1, -rand_size - 2],
                          [], [None, 2]):
        with raises(KMAccessError):
            const[invalid_index]

    # there must be a single unique value in the matrix or diagonal
    assert np.isclose(np.unique(const.full), rand_val).all()
    assert np.isclose(np.unique(const.diag), rand_val).all()

    expected = np.full((rand_size, rand_size), fill_value=rand_val)
    assert np.isclose(const.full, expected).all()
KMSetAdditionError, KernelMatrix, KernelSet, \ 6 | BaseKernelFunction 7 | from kernelmethods.numeric_kernels import GaussianKernel, LinearKernel, PolyKernel 8 | from kernelmethods.sampling import make_kernel_bucket 9 | 10 | num_samples = 50 # 9 11 | sample_dim = 3 # 2 12 | target_label_set = [1, 2] 13 | 14 | sample_data = np.random.rand(num_samples, sample_dim) 15 | target_labels = np.random.choice(target_label_set, (num_samples, 1)) 16 | 17 | IdealKM = target_labels.dot(target_labels.T) 18 | 19 | rbf = KernelMatrix(GaussianKernel(sigma=10, skip_input_checks=True)) 20 | lin = KernelMatrix(LinearKernel(skip_input_checks=True)) 21 | poly = KernelMatrix(PolyKernel(degree=2, skip_input_checks=True)) 22 | 23 | # lin.attach_to(sample_data) 24 | # rbf.attach_to(sample_data) 25 | # poly.attach_to(sample_data) 26 | 27 | kset = KernelSet([lin, poly, rbf]) 28 | print(kset) 29 | 30 | def test_creation(): 31 | 32 | try: 33 | ks = KernelSet() 34 | except: 35 | raise SyntaxError('empty set creation failed.') 36 | 37 | with raises(TypeError): 38 | ks = KernelSet(km_list='blah') 39 | 40 | def test_size_property_mismatch(): 41 | 42 | ks = KernelSet(num_samples=sample_data.shape[0]+1) 43 | lin = KernelMatrix(LinearKernel(skip_input_checks=True)) 44 | lin.attach_to(sample_data) 45 | with raises(KMSetAdditionError): 46 | ks.append(lin) 47 | 48 | 49 | def test_size(): 50 | 51 | assert kset.size == 3 52 | assert len(kset) == 3 53 | 54 | def test_get_item(): 55 | """access by index""" 56 | 57 | for invalid_index in [-1, kset.size]: 58 | with raises(IndexError): 59 | print(kset[invalid_index]) 60 | 61 | for invalid_index in [-1.0, '1']: 62 | with raises(ValueError): 63 | print(kset[invalid_index]) 64 | 65 | 66 | def test_get_ker_funcs(): 67 | 68 | for index in (0, 1): 69 | kf_list = kset.get_kernel_funcs([index, ]) 70 | for kf in kf_list: 71 | if not isinstance(kf, BaseKernelFunction): 72 | raise TypeError('get_kernel_funcs not returning proper output type') 73 | 74 | def test_take(): 75 | 
"""access by index""" 76 | 77 | for invalid_index in [-1, kset.size]: 78 | with raises(IndexError): 79 | print(kset.take([invalid_index])) 80 | 81 | for valid_index in np.random.randint(0, min(kset.size, 3), 3): 82 | _ks = kset.take(valid_index) 83 | if not isinstance(_ks, KernelSet): 84 | raise TypeError('.take not returning KernelSet') 85 | for _km in _ks: 86 | if not isinstance(_km, KernelMatrix): 87 | raise TypeError('Elements of KernelSet are not KernelMatrix!') 88 | 89 | k2 = kset.take([0, 1]) 90 | assert isinstance(k2, KernelSet) 91 | assert k2.size == 2 92 | 93 | def test_extend(): 94 | 95 | kset1 = KernelSet([poly, rbf, lin]) 96 | kset2 = KernelSet([poly, rbf]) 97 | kset1.extend(kset2) 98 | 99 | if kset1.size != 5: 100 | raise ValueError('KernelSet.extend() failed') 101 | 102 | with raises(KMSetAdditionError): 103 | kset1.extend(['blah', ]) 104 | 105 | with raises(KMSetAdditionError): 106 | k4_diff_size = KernelSet(num_samples=kset.size+1) 107 | kset1.extend(k4_diff_size) 108 | 109 | 110 | def test_attributes(): 111 | 112 | kset.set_attr('name', 'linear') 113 | for km in kset: 114 | assert km.get_attr('name') == 'linear' 115 | assert km.get_attr('noname', '404') == '404' 116 | 117 | values = np.random.rand(kset.size) 118 | kset.set_attr('weight', values) 119 | for ii, km in enumerate(kset): 120 | assert km.get_attr('weight') == values[ii] 121 | 122 | kb = make_kernel_bucket() 123 | kb.attach_to(sample_data, attr_name='a', attr_value='b') 124 | # differing length 125 | with raises(ValueError): 126 | kb.set_attr('a', ['value']*(kb.size-1)) 127 | 128 | kb.get_attr('a') 129 | 130 | # 131 | # print('Alignment to Ideal Kernel:') 132 | # ag = np.zeros(kb.size) 133 | # for ix, km in enumerate(kb): 134 | # ag[ix] = alignment_centered(km.full, IdealKM) 135 | # print('{:4} {:>60} : {:10.5f}'.format(ix, str(km),ag[ix])) 136 | 137 | test_take() 138 | -------------------------------------------------------------------------------- 
/kernelmethods/tests/test_numeric_kernels.py: -------------------------------------------------------------------------------- 1 | 2 | from numbers import Number 3 | 4 | import numpy as np 5 | from hypothesis import (HealthCheck, given, settings as hyp_settings, strategies) 6 | from pytest import raises 7 | 8 | from kernelmethods.base import KernelMatrix 9 | from kernelmethods.numeric_kernels import (Chi2Kernel, DEFINED_KERNEL_FUNCS, 10 | GaussianKernel, LaplacianKernel, 11 | LinearKernel, PolyKernel, SigmoidKernel, 12 | HadamardKernel) 13 | from kernelmethods.operations import is_positive_semidefinite 14 | from kernelmethods.utils import check_callable 15 | 16 | default_feature_dim = 10 17 | range_feature_dim = [10, 50] 18 | range_num_samples = [50, 100] 19 | 20 | range_polynomial_degree = [2, 10] # degree=1 is tested in LinearKernel() 21 | 22 | np.random.seed(42) 23 | 24 | # choosing skip_input_checks=False will speed up test runs 25 | # default values for parameters 26 | 27 | num_tests_psd_kernel = 3 28 | 29 | def gen_random_array(dim): 30 | """To better control precision and type of floats""" 31 | 32 | # TODO input sparse arrays for test 33 | return np.random.rand(dim) 34 | 35 | def gen_random_sample(num_samples, sample_dim): 36 | """To better control precision and type of floats""" 37 | 38 | # TODO input sparse arrays for test 39 | return np.random.rand(num_samples, sample_dim) 40 | 41 | 42 | def _test_for_all_kernels(kernel, sample_dim, check_PSDness=True): 43 | """Common tests that all kernels must pass.""" 44 | 45 | x = gen_random_array(sample_dim) 46 | y = gen_random_array(sample_dim) 47 | 48 | try: 49 | result = kernel(x, y) 50 | except Exception: 51 | raise RuntimeError('{} unable to calculate!\n' 52 | ' on x {}\n y{}'.format(kernel, x, y)) 53 | 54 | if not isinstance(result, Number): 55 | raise ValueError('result {} of type {} is not a number!\n' 56 | 'x={}\ny={}\nkernel={}\n' 57 | ''.format(result, type(result), x, y, kernel)) 58 | 59 | if kernel(y, x) 
!= result: 60 | raise ValueError('{} is not symmetric!' 61 | 'x={}\n y={}\n kernel={}\n' 62 | ''.format(kernel.name, x, y, kernel)) 63 | 64 | if check_PSDness: 65 | # ensuring it produces a PSD KM 66 | kernel.is_psd() 67 | 68 | 69 | def test_kernel_design(): 70 | """ 71 | Every kernel 72 | 1. must have a name defined 73 | 2. must be callable with two samples 74 | 3. must return a number 75 | 76 | """ 77 | 78 | for kernel in DEFINED_KERNEL_FUNCS: 79 | 80 | # must be callable with 2 args 81 | check_callable(kernel, min_num_args=2) 82 | 83 | if not hasattr(kernel, 'name'): 84 | raise TypeError('{} does not have name attribute!'.format(kernel)) 85 | 86 | # only numeric data is accepted and other dtypes must raise an error 87 | for non_num in ['string', 88 | [object, object] ]: 89 | with raises(ValueError): 90 | _ = kernel(non_num, non_num) 91 | 92 | 93 | def _test_func_is_valid_kernel(kernel, sample_dim, num_samples): 94 | """A func is a valid kernel if the kernel matrix generated by it is PSD. 
95 | 96 | Not including this in tests for all kernels to allow for non-PSD kernels in the future 97 | 98 | """ 99 | 100 | KM = KernelMatrix(kernel, name='TestKM') 101 | KM.attach_to(gen_random_sample(num_samples, sample_dim)) 102 | is_psd = is_positive_semidefinite(KM.full, verbose=True) 103 | if not is_psd: 104 | raise ValueError('{} is not PSD'.format(str(KM))) 105 | 106 | 107 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 108 | suppress_health_check=HealthCheck.all()) 109 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 110 | strategies.integers(range_num_samples[0], range_num_samples[1]), 111 | strategies.integers(range_polynomial_degree[0], range_polynomial_degree[1]), 112 | strategies.floats(min_value=0, max_value=1e3, 113 | allow_nan=False, allow_infinity=False)) 114 | def test_polynomial_kernel(sample_dim, num_samples, 115 | poly_degree, poly_intercept): 116 | """Tests specific for Polynomial kernel.""" 117 | 118 | poly = PolyKernel(degree=poly_degree, b=poly_intercept, skip_input_checks=False) 119 | _test_for_all_kernels(poly, sample_dim) 120 | _test_func_is_valid_kernel(poly, sample_dim, num_samples) 121 | 122 | 123 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 124 | suppress_health_check=HealthCheck.all()) 125 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 126 | strategies.integers(range_num_samples[0], range_num_samples[1]), 127 | strategies.floats(min_value=0, max_value=1e6, 128 | allow_nan=False, allow_infinity=False)) 129 | def test_gaussian_kernel(sample_dim, num_samples, sigma): 130 | """Tests specific for Gaussian kernel.""" 131 | 132 | gaussian = GaussianKernel(sigma=sigma, skip_input_checks=False) 133 | _test_for_all_kernels(gaussian, sample_dim) 134 | _test_func_is_valid_kernel(gaussian, sample_dim, num_samples) 135 | 136 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 137 | suppress_health_check=HealthCheck.all()) 138 | 
@given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 139 | strategies.integers(range_num_samples[0], range_num_samples[1])) 140 | def test_linear_kernel(sample_dim, num_samples): 141 | """Tests specific for Linear kernel.""" 142 | 143 | linear = LinearKernel(skip_input_checks=False) 144 | _test_for_all_kernels(linear, sample_dim) 145 | _test_func_is_valid_kernel(linear, sample_dim, num_samples) 146 | 147 | 148 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 149 | suppress_health_check=HealthCheck.all()) 150 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 151 | strategies.integers(range_num_samples[0], range_num_samples[1]), 152 | strategies.floats(min_value=0, max_value=1e6, 153 | allow_nan=False, allow_infinity=False)) 154 | def test_laplacian_kernel(sample_dim, num_samples, gamma): 155 | """Tests specific for Laplacian kernel.""" 156 | 157 | laplacian = LaplacianKernel(gamma=gamma, skip_input_checks=False) 158 | _test_for_all_kernels(laplacian, sample_dim) 159 | _test_func_is_valid_kernel(laplacian, sample_dim, num_samples) 160 | 161 | 162 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 163 | suppress_health_check=HealthCheck.all()) 164 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 165 | strategies.integers(range_num_samples[0], range_num_samples[1]), 166 | strategies.floats(min_value=0, max_value=1e6, 167 | allow_nan=False, allow_infinity=False), 168 | strategies.floats(min_value=0, max_value=1e6, 169 | allow_nan=False, allow_infinity=False) 170 | ) 171 | def test_sigmoid_kernel(sample_dim, num_samples, gamma, offset): 172 | """Tests specific for sigmoid kernel.""" 173 | 174 | sigmoid = SigmoidKernel(gamma=gamma, offset=offset, skip_input_checks=False) 175 | # sigmoid is not always PSD 176 | _test_for_all_kernels(sigmoid, sample_dim, check_PSDness=False) 177 | 178 | 179 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 180 | 
suppress_health_check=HealthCheck.all()) 181 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 182 | strategies.integers(range_num_samples[0], range_num_samples[1]), 183 | strategies.floats(min_value=0, max_value=1e6, 184 | allow_nan=False, allow_infinity=False)) 185 | def test_chi2_kernel(sample_dim, num_samples, gamma): 186 | """Tests specific for Chi2 kernel.""" 187 | 188 | chi2 = Chi2Kernel(gamma=gamma, skip_input_checks=False) 189 | _test_for_all_kernels(chi2, sample_dim) 190 | _test_func_is_valid_kernel(chi2, sample_dim, num_samples) 191 | 192 | 193 | def test_chi2_kernel_misc(): 194 | """Tests specific for Chi2 kernel.""" 195 | 196 | chi2 = Chi2Kernel() 197 | x = gen_random_array(10) 198 | y = gen_random_array(10) 199 | 200 | neg_x = x - x.mean() # some values would be negative 201 | pos_y = np.abs(y) 202 | 203 | from kernelmethods.config import Chi2NegativeValuesException 204 | with raises(Chi2NegativeValuesException): 205 | chi2(neg_x, pos_y) 206 | with raises(Chi2NegativeValuesException): 207 | chi2(pos_y, neg_x) 208 | 209 | @hyp_settings(max_examples=num_tests_psd_kernel, deadline=None, 210 | suppress_health_check=HealthCheck.all()) 211 | @given(strategies.integers(range_feature_dim[0], range_feature_dim[1]), 212 | strategies.floats(min_value=1, max_value=1e6, 213 | allow_nan=False, allow_infinity=False)) 214 | def test_Hadamard_kernel(sample_dim, alpha): 215 | """Tests specific for Hadamard kernel.""" 216 | 217 | had = HadamardKernel(alpha=alpha, skip_input_checks=False) 218 | _test_for_all_kernels(had, sample_dim, check_PSDness=False) 219 | 220 | 221 | def test_Hadamard_kernel_misc(): 222 | """Tests specific for Hadamard kernel.""" 223 | 224 | with raises(ValueError): 225 | had = HadamardKernel(alpha=0) 226 | 227 | -------------------------------------------------------------------------------- /kernelmethods/tests/test_operations.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | from kernelmethods.base import KernelMatrix 3 | from kernelmethods.config import KMNormError 4 | from kernelmethods.numeric_kernels import LinearKernel 5 | from kernelmethods.operations import (alignment_centered, center_km, frobenius_norm, 6 | frobenius_product, is_PSD, linear_combination, 7 | normalize_km, normalize_km_2sample) 8 | from kernelmethods.sampling import make_kernel_bucket 9 | from numpy.random import randn 10 | from pytest import raises, warns 11 | 12 | num_samples = np.random.randint(20, 50) 13 | sample_dim = 3 # 2 14 | target_label_set = [1, 2] 15 | 16 | sample_data = np.random.rand(num_samples, sample_dim) 17 | target_labels = np.random.choice(target_label_set, (num_samples, 1)) 18 | 19 | A = np.random.rand(4, 4) 20 | B = np.random.rand(4, 4) 21 | 22 | 23 | def gen_random_array(dim): 24 | """To better control precision and type of floats""" 25 | 26 | # TODO input sparse arrays for test 27 | return np.random.rand(dim) 28 | 29 | 30 | def gen_random_sample(num_samples, sample_dim): 31 | """To better control precision and type of floats""" 32 | 33 | # TODO input sparse arrays for test 34 | return np.random.rand(num_samples, sample_dim) 35 | 36 | 37 | def test_psd(): 38 | with raises(TypeError): 39 | is_PSD([2, 34, 23]) 40 | 41 | if is_PSD(np.random.rand(2, 4)): 42 | raise ValueError('Non-square matrix is being deemed PSD!!! Big error!') 43 | 44 | if is_PSD(np.random.rand(5, 5)): 45 | raise ValueError('Non-symmetric matrix is being deemed PSD!!! Big error!') 46 | 47 | negative_semi_def_matrix = np.array([[-1, 0], [0, -1]]) 48 | if is_PSD(negative_semi_def_matrix): 49 | raise ValueError('Implementation for PSD check failed. 
' 50 | 'negative_semi_def_matrix is approved as PSD.') 51 | 52 | not_psd_matrices = (np.array([[1, 1.00001, 1], 53 | [1.00001, 1, 1.00001], 54 | [1, 1.00001, 1]]), 55 | np.array([[3, 4], 56 | [4, 3]])) 57 | for not_psd_matrix in not_psd_matrices: 58 | assert is_PSD(not_psd_matrix) is False 59 | 60 | 61 | def test_frobenius_product(): 62 | A = np.array([[1, 2], [3, 4]]) 63 | B = np.array([[4, 1], [2, 5]]) 64 | C = np.array([[10, 2, 5], [6, 8, 6]]) 65 | 66 | fprod = frobenius_product(A, B) 67 | if not np.isclose(fprod, 32): 68 | raise ValueError('Frobenius product implementation is wrong!') 69 | 70 | with raises(ValueError): 71 | frobenius_product(B, C) 72 | 73 | fnorm = frobenius_norm(A) 74 | assert np.isreal(fnorm) 75 | if not np.isclose(fnorm, np.sqrt(frobenius_product(A, A))): 76 | raise ValueError('Frobenius norm implementation is wrong!') 77 | 78 | 79 | def test_centering(): 80 | with raises(ValueError): 81 | center_km(np.full((3, 4), 1)) 82 | 83 | with raises(ValueError): 84 | center_km([]) 85 | 86 | mat_size = 10 87 | kmc = center_km(np.random.rand(mat_size, mat_size)) 88 | assert kmc.shape == (mat_size, mat_size) 89 | 90 | 91 | def test_normalize(): 92 | with raises(ValueError): 93 | normalize_km(np.full((3, 4), 1)) 94 | 95 | with raises(KMNormError): 96 | normalize_km(np.zeros((5, 5))) 97 | 98 | kmc = normalize_km(randn(10, 10)) 99 | 100 | 101 | def test_normalize_two_sample(): 102 | num_samples_one = 3 103 | num_samples_two = 4 104 | with raises(ValueError): 105 | normalize_km_2sample(randn(num_samples_one, num_samples_two), 106 | randn(num_samples_two + 1, 1), []) 107 | 108 | with raises(ValueError): 109 | normalize_km_2sample(randn(num_samples_one, num_samples_two), 110 | randn(num_samples_one, 1), 111 | randn(num_samples_two - 1, 1), ) 112 | 113 | with raises((KMNormError, ValueError, RuntimeError)): 114 | normalize_km_2sample(np.zeros((5, 5)), np.zeros((5, 1)), np.zeros((5, 1))) 115 | 116 | with raises(NotImplementedError): 117 | 
normalize_km_2sample(randn(num_samples_one, num_samples_two), 118 | randn(num_samples_one, 1), 119 | randn(num_samples_two, 1), 120 | method='notcosine') 121 | 122 | with raises(NotImplementedError): 123 | normalize_km(randn(10, 10), method='notcosine') 124 | 125 | # the following should work 126 | _ = normalize_km(randn(10, 10)) 127 | # adding 0.1 to diagonals to avoid norm errors with denom close to 0 128 | diag_one = np.abs(randn(num_samples_one, 1)) + 0.1 129 | diag_two = np.abs(randn(num_samples_two, 1)) + 0.1 130 | _ = normalize_km_2sample(np.abs(randn(num_samples_one, num_samples_two)), 131 | diag_one, diag_two, method='cosine') 132 | 133 | 134 | def test_alignment_centered(): 135 | km1 = KernelMatrix(kernel=LinearKernel()) 136 | km1.attach_to(gen_random_sample(num_samples, sample_dim)) 137 | 138 | km2 = KernelMatrix(kernel=LinearKernel()) 139 | km2.attach_to(gen_random_sample(num_samples, sample_dim)) 140 | 141 | km3_bad_size = KernelMatrix(kernel=LinearKernel()) 142 | km3_bad_size.attach_to(gen_random_sample(num_samples + 2, sample_dim)) 143 | 144 | with raises(ValueError): 145 | alignment_centered(km1.full, km3_bad_size.full) 146 | 147 | # bad type : must be ndarray 148 | with raises(TypeError): 149 | alignment_centered(km1, km2.full) 150 | 151 | # bad type : must be ndarray 152 | with raises(TypeError): 153 | alignment_centered(km1.full, km2) 154 | 155 | for flag in (True, False): 156 | _ = alignment_centered(km1.full, km2.full, centered_already=flag) 157 | 158 | with raises(ValueError): 159 | _ = alignment_centered(np.zeros((10, 10)), randn(10, 10), 160 | value_if_zero_division='raise') 161 | 162 | return_val_requested = 'random_set_value' 163 | with warns(UserWarning): 164 | ret_value = alignment_centered(randn(10, 10), 165 | np.zeros((10, 10)), 166 | value_if_zero_division=return_val_requested) 167 | if ret_value != return_val_requested: 168 | raise ValueError('Not returning the value requested in case of error!') 169 | 170 | 171 | def 
test_linear_comb(): 172 | kset = make_kernel_bucket('light') 173 | weights = randn(kset.size) 174 | kset.attach_to(sample_data) 175 | lc = linear_combination(kset, weights) 176 | 177 | with raises(ValueError): 178 | lc = linear_combination(kset, randn(kset.size + 1)) 179 | 180 | zero_weights = np.zeros((kset.size,1)) 181 | lc0 = linear_combination(kset, zero_weights) 182 | if not np.isclose(lc0.max(), 0.0): 183 | raise ValueError('zero weights do not lead to zero KM!') 184 | 185 | with raises(RuntimeError): 186 | lc0 = linear_combination(kset, zero_weights, norm_weights=True) 187 | -------------------------------------------------------------------------------- /kernelmethods/tests/test_ranking.py: -------------------------------------------------------------------------------- 1 | 2 | from kernelmethods.sampling import make_kernel_bucket 3 | from kernelmethods.ranking import find_optimal_kernel, rank_kernels, \ 4 | alignment_ranking, min_max_scale, CV_ranking, get_estimator 5 | import numpy as np 6 | from pytest import raises, warns 7 | 8 | kb = make_kernel_bucket() 9 | 10 | def test_misc(): 11 | 12 | raises(TypeError, find_optimal_kernel, 'bucket', None, None) 13 | 14 | with raises(NotImplementedError): 15 | rank_kernels(kb, None, method='align/corr') 16 | 17 | -------------------------------------------------------------------------------- /kernelmethods/tests/test_sampling.py: -------------------------------------------------------------------------------- 1 | from kernelmethods.numeric_kernels import (PolyKernel, GaussianKernel, 2 | SigmoidKernel, 3 | LaplacianKernel) 4 | import numpy as np 5 | from kernelmethods.config import KernelMethodsException, kernel_bucket_strategies 6 | from kernelmethods.numeric_kernels import (GaussianKernel, LaplacianKernel, 7 | PolyKernel, SigmoidKernel) 8 | from kernelmethods.sampling import (KernelBucket, correlation_km, ideal_kernel, 9 | make_kernel_bucket, pairwise_similarity) 10 | from pytest import raises, warns 11 | 12 | 
num_samples = 50 # 9 13 | sample_dim = 3 # 2 14 | target_label_set = [1, 2] 15 | 16 | sample_data = np.random.rand(num_samples, sample_dim) 17 | target_labels = np.random.choice(target_label_set, (num_samples, 1)) 18 | 19 | A = np.random.rand(4, 4) 20 | B = np.random.rand(4, 4) 21 | 22 | 23 | def gen_random_array(dim): 24 | """To better control precision and type of floats""" 25 | 26 | # TODO input sparse arrays for test 27 | return np.random.rand(dim) 28 | 29 | 30 | def gen_random_sample(num_samples, sample_dim): 31 | """To better control precision and type of floats""" 32 | 33 | # TODO input sparse arrays for test 34 | return np.random.rand(num_samples, sample_dim) 35 | 36 | 37 | kset = make_kernel_bucket('light') 38 | kset.attach_to(sample_data) 39 | 40 | 41 | def test_make_bucket(): 42 | with warns(UserWarning): 43 | _ = make_kernel_bucket(kset) 44 | 45 | with raises(ValueError): 46 | _ = make_kernel_bucket('blah_invalid_strategy') 47 | 48 | # ensure correct values work 49 | for strategy in kernel_bucket_strategies: 50 | _ = make_kernel_bucket(strategy=strategy) 51 | 52 | 53 | def test_KB_class(): 54 | for param in ['normalize_kernels', 'skip_input_checks']: 55 | for invalid_value in (1, 'str', 34., 2 + 4j): 56 | with raises(TypeError): 57 | _ = KernelBucket(**{param: invalid_value}) 58 | 59 | 60 | def test_add_parametrized_kernels(): 61 | kb = KernelBucket() 62 | for invalid_kfunc in ('kfunc', gen_random_sample, KernelBucket,): 63 | with raises(KernelMethodsException): 64 | kb.add_parametrized_kernels(invalid_kfunc, 'param', (1,)) 65 | 66 | for invalid_values in ('string', gen_random_sample, [], KernelBucket): 67 | with raises(ValueError): 68 | kb.add_parametrized_kernels(PolyKernel, 'param', invalid_values) 69 | 70 | for invalid_param in ('__param__', (), 'blahblah', 5): 71 | for ker_func in (PolyKernel, LaplacianKernel, GaussianKernel, SigmoidKernel): 72 | with raises(ValueError): 73 | kb.add_parametrized_kernels(ker_func, invalid_param, 2) 74 | 75 | 76 | 
def test_ideal_kernel(): 77 | ik = ideal_kernel(np.random.randint(1, 5, num_samples)) 78 | if ik.size != num_samples ** 2: 79 | raise ValueError('ideal kernel size unexpected') 80 | 81 | 82 | def test_correlation_km(): 83 | corr_coef = correlation_km(np.random.rand(10, 10), np.random.rand(10, 10)) 84 | if corr_coef > 1 or corr_coef < -1: 85 | raise ValueError('correlation out of bounds [-1, 1]') 86 | 87 | 88 | def test_pairwise_similarity(): 89 | ps = pairwise_similarity(kset) 90 | if ps.shape != (kset.size, kset.size): 91 | raise ValueError('invalid shape for pairwise_similarity computation') 92 | -------------------------------------------------------------------------------- /kernelmethods/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from pytest import raises 4 | 5 | from kernelmethods.numeric_kernels import (GaussianKernel, LaplacianKernel, 6 | LinearKernel, PolyKernel) 7 | from kernelmethods.utils import (check_callable, check_input_arrays, 8 | check_operation_kernel_matrix, ensure_ndarray_1D, 9 | ensure_ndarray_2D, 10 | get_callable_name, not_symmetric) 11 | 12 | default_feature_dim = 10 13 | range_feature_dim = [10, 500] 14 | range_num_samples = [50, 500] 15 | num_samples = np.random.randint(20) 16 | sample_dim = np.random.randint(10) 17 | 18 | range_polynomial_degree = [2, 10] # degree=1 is tested in LinearKernel() 19 | 20 | np.random.seed(42) 21 | 22 | # choosing skip_input_checks=False will speed up test runs 23 | # default values for parameters 24 | SupportedKernels = (GaussianKernel(), PolyKernel(), LinearKernel(), 25 | LaplacianKernel()) 26 | num_tests_psd_kernel = 3 27 | 28 | 29 | def test_check_input_arrays(): 30 | 31 | with raises(ValueError): 32 | check_input_arrays(np.random.rand(10, 5), np.random.rand(5, 4)) 33 | 34 | with raises(ValueError): 35 | check_input_arrays(np.random.rand(10), np.random.rand(5)) 36 | 37 | # from scipy.sparse import csr_matrix 38 | # 
s1 = csr_matrix((3,4)) 39 | # s2 = csr_matrix((3, 4)) 40 | # _, _ = check_input_arrays(s1, s2) 41 | 42 | def test_valid_op(): 43 | 44 | for invalid_op in ('foo', 'bar', 'adition', 'some'): 45 | with raises(ValueError): 46 | check_operation_kernel_matrix(invalid_op) 47 | 48 | from kernelmethods.config import VALID_KERNEL_MATRIX_OPS 49 | for valid_op in VALID_KERNEL_MATRIX_OPS: 50 | _ = check_operation_kernel_matrix(valid_op) 51 | 52 | def test_ensure_array_dim(): 53 | 54 | with raises(ValueError): 55 | ensure_ndarray_2D(np.random.rand(10, 5), ensure_num_cols=3) 56 | 57 | with raises(ValueError): 58 | ensure_ndarray_2D(np.random.rand(10), ensure_num_cols=3) 59 | 60 | with raises(ValueError): 61 | ensure_ndarray_1D(np.random.rand(10, 5)) 62 | 63 | with raises(ValueError): 64 | ensure_ndarray_1D(np.random.rand(10, 5, 10)) 65 | 66 | def test_misc(): 67 | 68 | _ = get_callable_name(test_ensure_array_dim, 'test') 69 | _ = get_callable_name('test_ensure_array_dim', None) 70 | 71 | with raises(TypeError): 72 | check_callable('kdjkj') 73 | 74 | def func_with_less_than_min_args(): return None 75 | 76 | with raises(TypeError): 77 | check_callable(func_with_less_than_min_args) 78 | 79 | with raises(TypeError): 80 | check_callable(func_with_less_than_min_args, 1) 81 | 82 | with raises(TypeError): 83 | check_callable(func_with_less_than_min_args, 3) 84 | 85 | assert not_symmetric(np.array([[1, 2], [1, 2]])) is True 86 | -------------------------------------------------------------------------------- /kernelmethods/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from scipy.sparse import issparse 4 | from kernelmethods import config 5 | from collections.abc import Iterable 6 | 7 | def check_input_arrays(x, y, ensure_dtype=np.number): 8 | """ 9 | Ensures the inputs are 10 | 1) 1D arrays (not matrices) 11 | 2) with compatible size 12 | 3) of a particular data type 13 | and hence are safe to operate on. 
14 | 15 | Parameters 16 | ---------- 17 | x : iterable 18 | 19 | y : iterable 20 | 21 | ensure_dtype : dtype 22 | 23 | Returns 24 | ------- 25 | x : ndarray 26 | 27 | y : ndarray 28 | 29 | """ 30 | 31 | x = ensure_ndarray_1D(x, ensure_dtype) 32 | y = ensure_ndarray_1D(y, ensure_dtype) 33 | 34 | if x.size != y.size: 35 | raise ValueError('x (n={}) and y (n={}) differ in size! ' 36 | 'They must be of same length'.format(x.size, y.size)) 37 | 38 | return x, y 39 | 40 | 41 | def ensure_ndarray_2D(array, ensure_dtype=np.number, ensure_num_cols=None): 42 | """Converts the input to a numpy array and ensures it is 2D.""" 43 | 44 | if not isinstance(array, np.ndarray): 45 | array = np.asarray(array) 46 | 47 | # squeezing only 3rd and later dims if they are singleton, leaving 1st & 2nd dim alone 48 | axes_to_sqz = tuple(ax for ax, sz in enumerate(array.shape) if sz==1 and ax>1) 49 | array = np.squeeze(array, axis=axes_to_sqz) 50 | 51 | array = ensure_ndarray_size(array, ensure_dtype=ensure_dtype, ensure_num_dim=2) 52 | 53 | if ensure_num_cols is not None and array.shape[1] != ensure_num_cols: 54 | raise ValueError('The number of columns differ from expected {}' 55 | ''.format(ensure_num_cols)) 56 | 57 | return array 58 | 59 | 60 | def ensure_ndarray_1D(array, ensure_dtype=np.number): 61 | """Converts the input to a numpy array and ensures it is 1D.""" 62 | 63 | if not isinstance(array, np.ndarray): 64 | array = np.asarray(array) 65 | 66 | # squeezing only 2nd and later dims if they are singleton, leaving 1st dim alone 67 | axes_to_sqz = tuple(ax for ax, sz in enumerate(array.shape) if sz==1 and ax>0) 68 | array = np.squeeze(array, axis=axes_to_sqz) 69 | 70 | return ensure_ndarray_size(array, ensure_dtype=ensure_dtype, ensure_num_dim=1) 71 | 72 | 73 | def ensure_ndarray_size(array, ensure_dtype=np.number, ensure_num_dim=1): 74 | """Ensures the array has the specified number of dims, recasting its dtype if needed.""" 75 | 76 | if array.ndim != ensure_num_dim: 77 | raise ValueError('array must be 
{}-dimensional! ' 78 | 'It has {} dims with shape {} ' 79 | ''.format(ensure_num_dim, array.ndim, array.shape)) 80 | 81 | if not np.issubdtype(ensure_dtype, array.dtype): 82 | prev_dtype = array.dtype 83 | try: 84 | array = array.astype(ensure_dtype) 85 | except: 86 | raise ValueError('Unable to recast input dtype from {} to required {}!' 87 | ''.format(prev_dtype, ensure_dtype)) 88 | 89 | return array 90 | 91 | 92 | def check_callable(input_func, min_num_args=2): 93 | """Ensures the input func 1) is callable, and 2) can accept a min # of args""" 94 | 95 | if not callable(input_func): 96 | raise TypeError('Input function must be callable!') 97 | 98 | from inspect import signature 99 | # would not work for C/builtin functions such as numpy.dot 100 | func_signature = signature(input_func) 101 | 102 | if len(func_signature.parameters) < min_num_args: 103 | raise TypeError('Input func must accept atleast {} inputs'.format(min_num_args)) 104 | 105 | return input_func 106 | 107 | 108 | def get_callable_name(input_func, name=None): 109 | """Returns the callable name""" 110 | 111 | if name is None: 112 | if hasattr(input_func, '__name__'): 113 | return input_func.__name__ 114 | else: 115 | return '' 116 | else: 117 | return str(name) 118 | 119 | _float_eps = np.finfo('float').eps 120 | 121 | def _ensure_min_eps(x): 122 | return np.maximum(_float_eps, x) 123 | 124 | def not_symmetric(matrix): 125 | """Returns true if the input matrix is not symmetric.""" 126 | 127 | if not np.isclose(matrix, matrix.T).all(): 128 | return True 129 | else: 130 | return False 131 | 132 | def check_operation_kernel_matrix(operation): 133 | """Validates whether input is a valid operation on KernelMatrices""" 134 | 135 | opr = operation.lower() 136 | if opr not in config.VALID_KERNEL_MATRIX_OPS: 137 | raise ValueError('Invalid kernel matrix operation - must be one of:\n{}' 138 | ''.format(config.VALID_KERNEL_MATRIX_OPS)) 139 | 140 | return opr 141 | 142 | 143 | def min_max_scale(array): 144 | 
"""Rescale the array values from 0 to 1 via min-max normalization.""" 145 | 146 | array = np.array(array) 147 | min_val = array.min() 148 | return (array - min_val) / (np.max(array) - min_val) 149 | 150 | 151 | def contains_nan_inf(matrix): 152 | """ 153 | Helper func to check for the presence of NaN or Inf. 154 | 155 | Returns True if any element is not finite (Inf) or NaN. Returns False otherwise. 156 | 157 | This is designed to works for both dense and sparse matrices! 158 | """ 159 | 160 | if issparse(matrix): 161 | matrix = matrix.todense() 162 | 163 | if (not np.isfinite(matrix).all()) \ 164 | or (np.isnan(matrix).any()): 165 | return True 166 | else: 167 | return False 168 | 169 | 170 | def is_iterable_but_not_str(input_obj, min_length=1): 171 | """Boolean check for iterables that are not strings and of a minimum length""" 172 | 173 | if not (not isinstance(input_obj, str) and isinstance(input_obj, Iterable)): 174 | return False 175 | 176 | if len(input_obj) < min_length: 177 | return False 178 | else: 179 | return True 180 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -s -v 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | numpy 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | numpy 3 | scikit-learn 4 | pytest 5 | pytest-runner 6 | pyyaml 7 | python-coveralls 8 | hypothesis 9 | bumpversion 10 | wheel 11 | watchdog 12 | flake8 13 | coverage 14 | Sphinx 15 | twine 16 | -------------------------------------------------------------------------------- /setup.cfg: 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""The setup script."""

import io

from setuptools import setup, find_packages
import versioneer

# Read the long-description sources with an explicit encoding: the builtin
# open() falls back to the locale's preferred encoding, which can fail on
# non-ASCII text. io.open accepts ``encoding`` on Python 2 and 3 alike.
with io.open('README.rst', encoding='utf-8') as readme_file:
    readme = readme_file.read()

with io.open('HISTORY.rst', encoding='utf-8') as history_file:
    history = history_file.read()

# runtime dependencies
requirements = ['scipy',
                'numpy']

# needed to run ``python setup.py test`` through pytest
setup_requirements = ['pytest-runner', ]

test_requirements = ['pytest', ] + requirements

setup(name='kernelmethods',
      # version and build commands are delegated to versioneer
      version=versioneer.get_version(),
      cmdclass=versioneer.get_cmdclass(),
      author="Pradeep Reddy Raamana",
      author_email='raamana@gmail.com',
      classifiers=[
          'Development Status :: 2 - Pre-Alpha',
          'Intended Audience :: Developers',
          'License :: OSI Approved :: Apache Software License',
          'Natural Language :: English',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.4',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          ],
      description="kernel methods and classes",
      install_requires=requirements,
      license="Apache Software License 2.0",
      long_description=readme + '\n\n' + history,
      include_package_data=True,
      keywords='kernelmethods',
      packages=find_packages(include=['kernelmethods']),
      setup_requires=setup_requirements,
      test_suite='tests',
      tests_require=test_requirements,
      url='https://github.com/raamana/kernelmethods',
      zip_safe=False,
      )