├── .gitignore ├── .travis.yml ├── COPYING.MIT ├── ChangeLog ├── INSTALL.rst ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile └── source │ ├── adaboost.rst │ ├── api.rst │ ├── benchmarks.rst │ ├── clustering.rst │ ├── conf.py │ ├── examples.rst │ ├── extensions.rst │ ├── featureselection.rst │ ├── index.rst │ ├── milksets.rst │ ├── nfoldcrossvalidation.rst │ ├── parallel.rst │ ├── principles.rst │ ├── randomforests.rst │ ├── randomnumbers.rst │ ├── readme.rst │ └── supervised.rst ├── get-eigen.sh ├── milk ├── __init__.py ├── active │ ├── __init__.py │ ├── eimpact.py │ └── uncertainty.py ├── demos │ ├── __init__.py │ ├── adaboost.py │ ├── rf_wine_2d.py │ └── svm-decision-boundary.py ├── ext │ ├── __init__.py │ └── jugparallel.py ├── measures │ ├── __init__.py │ ├── cluster_agreement.py │ ├── curves.py │ ├── measures.py │ └── nfoldcrossvalidation.py ├── milk_version.py ├── nfoldcrossvalidation.py ├── supervised │ ├── __init__.py │ ├── _lasso.cpp │ ├── _perceptron.cpp │ ├── _svm.cpp │ ├── _tree.cpp │ ├── adaboost.py │ ├── base.py │ ├── classifier.py │ ├── defaultclassifier.py │ ├── defaultlearner.py │ ├── featureselection.py │ ├── gridsearch.py │ ├── grouped.py │ ├── knn.py │ ├── lasso.py │ ├── logistic.py │ ├── multi.py │ ├── multi_label.py │ ├── multi_view.py │ ├── normalise.py │ ├── parzen.py │ ├── perceptron.py │ ├── precluster.py │ ├── precluster_learner.py │ ├── randomforest.py │ ├── set2binary_array.py │ ├── svm.py │ ├── tree.py │ └── weighted_voting_adaboost.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── jugparallel_jugfile.py │ │ ├── jugparallel_kmeans_jugfile.py │ │ └── regression-2-Dec-2009.pp.gz │ ├── fast_classifier.py │ ├── test_adaboost.py │ ├── test_affinity.py │ ├── test_basic.py │ ├── test_curves.py │ ├── test_defaultclassifier.py │ ├── test_defaultlearner.py │ ├── test_ecoc_learner.py │ ├── test_ext_jugparallel.py │ ├── test_featureselection.py │ ├── test_fisher.py │ ├── test_gaussianmixture.py │ ├── test_gridsearch.py │ ├── test_grouped.py │ ├── test_kmeans.py │ ├── test_knn.py │ ├── test_lasso.py │ ├── test_logistic.py │ ├── test_measures.py │ ├── test_measures_clusters.py │ ├── test_multi.py │ ├── test_multi_label.py │ ├── test_multi_view.py │ ├── test_nfoldcrossvalidation.py │ ├── test_nfoldcrossvalidation_regression.py │ ├── test_nnmf.py │ ├── test_normalise.py │ ├── test_normaliselabels.py │ ├── test_parzen.py │ ├── test_pca.py │ ├── test_pdist.py │ ├── test_perceptron.py │ ├── test_precluster_learner.py │ ├── test_regression.py │ ├── test_regression_constant_features.py │ ├── test_rf.py │ ├── test_set2binary_array.py │ ├── test_som.py │ ├── test_svm.py │ ├── test_svm_sigmoidal.py │ ├── test_tree.py │ └── test_utils.py ├── unsupervised │ ├── __init__.py │ ├── _kmeans.cpp │ ├── _som.cpp │ ├── affinity.py │ ├── gaussianmixture.py │ ├── kmeans.py │ ├── nnmf │ │ ├── __init__.py │ │ ├── hoyer.py │ │ └── lee_seung.py │ ├── normalise.py │ ├── parzen.py │ ├── pca.py │ ├── pdist.py │ └── som.py ├── utils │ ├── __init__.py │ ├── parallel.py │ ├── utils.h │ └── utils.py └── wrapper │ ├── __init__.py │ └── wraplibsvm.py ├── readthedocs-requirements.txt ├── setup.py └── template.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | milk/supervised/_svm.so 3 | milk/supervised/_tree.so 4 | milk/supervised/_perceptron.so 5 | milk/supervised/_lasso.so 6 | milk/unsupervised/_som.so 7 | milk/unsupervised/_kmeans.so 8 | build 9 | dist/ 10 | milk.egg-info 11 | docs/milk/ 12 | *.cpython*.so 13 
| -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | before_install: 7 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 8 | - chmod +x miniconda.sh 9 | - ./miniconda.sh -b 10 | - export PATH=/home/travis/miniconda/bin:$PATH 11 | - conda update --yes conda 12 | - sudo apt-get update -qq 13 | - sudo apt-get install -qq libatlas-dev liblapack-dev gfortran 14 | - sudo apt-get install -qq libeigen3-dev 15 | install: 16 | - conda create --yes -n condaenv python=$TRAVIS_PYTHON_VERSION numpy=1.9 17 | - conda install --yes -n condaenv scipy matplotlib pillow nose pip 18 | - conda install --yes -n condaenv -c https://conda.binstar.org/luispedro imread 19 | - source activate condaenv 20 | - pip install --quiet coveralls 21 | - pip install milksets 22 | - make debug 23 | script: nosetests 24 | -------------------------------------------------------------------------------- /COPYING.MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2011 Luis Pedro Coelho 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Building milk 3 | ============= 4 | 5 | To install dependencies in Ubuntu:: 6 | 7 | sudo apt-get install python-numpy python-scipy libeigen3-dev 8 | 9 | The following should work:: 10 | 11 | python setup.py install 12 | 13 | A C++ compiler is required. On Windows, you might need to specify the compiler. 14 | For example, if you have MinGW installed:: 15 | 16 | python setup.py install --compiler=mingw32 17 | 18 | --------------- 19 | Building on OSX 20 | --------------- 21 | 22 | Because the standard library used on OS X doesn't include the C++11 libraries by default, you will need to enable 23 | them in ``setup.py``: the final line of the compile arguments should look like:: 24 | 25 | extra_compile_args=['-std=c++0x', '-stdlib=libc++'],
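For reference, here is a minimal sketch of how such an ``Extension`` entry in ``setup.py`` might look with these flags. The module name, source list, and include directory below are illustrative only, not the actual values from milk's ``setup.py``::

    from setuptools import setup, Extension

    # Hypothetical extension entry; milk's real setup.py defines several C++ modules.
    ext = Extension('milk.supervised._svm',
                    sources=['milk/supervised/_svm.cpp'],
                    include_dirs=['milk/supervised/eigen3'],
                    extra_compile_args=['-std=c++0x', '-stdlib=libc++'])

    setup(name='milk', ext_modules=[ext])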
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include INSTALL.rst 3 | include COPYING.MIT 4 | include milk/tests/data/* 5 | include milk/utils/utils.h 6 | recursive-include milk/supervised/eigen3 * 7 | 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SOURCES = milk/*/*.cpp 2 | 3 | debug: $(SOURCES) 4 | DEBUG=2 python setup.py build --build-lib=. 5 | 6 | fast: $(SOURCES) 7 | python setup.py build --build-lib=. 8 | 9 | clean: 10 | rm -rf build milk/*/*.so 11 | 12 | tests: debug 13 | nosetests -vx 14 | 15 | docs: 16 | rm -rf build/docs 17 | cd docs && make html && cp -r build/html ../build/docs 18 | @echo python setup.py upload_docs 19 | 20 | .PHONY: clean docs tests fast debug 21 | 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | MILK: MACHINE LEARNING TOOLKIT 3 | ============================== 4 | Machine Learning in Python 5 | -------------------------- 6 | 7 | Milk is a machine learning toolkit in Python. 8 | 9 | Its focus is on supervised classification with several classifiers available: 10 | SVMs (based on libsvm), k-NN, random forests, decision trees. It also performs 11 | feature selection. These classifiers can be combined in many ways to form 12 | different classification systems. 13 | 14 | For unsupervised learning, milk supports k-means clustering and affinity 15 | propagation. 16 | 17 | Milk is flexible about its inputs. It is optimised for numpy arrays, but can often 18 | handle anything (for example, for SVMs, you can use any datatype and any kernel 19 | and it does the right thing). 20 | 21 | There is a strong emphasis on speed and low memory usage. Therefore, most of 22 | the performance sensitive code is in C++. This sits behind Python-based 23 | interfaces for convenience. 24 | 25 | To learn more, check the docs at `http://packages.python.org/milk/ 26 | <http://packages.python.org/milk/>`_ or the code demos included with the source 27 | at ``milk/demos/``.
28 | 29 | Examples 30 | -------- 31 | 32 | Here is how to test how well you can classify some ``features,labels`` data, 33 | measured by cross-validation:: 34 | 35 | import numpy as np 36 | import milk 37 | features = np.random.rand(100,10) # 2d array of features: 100 examples of 10 features each 38 | labels = np.zeros(100) 39 | features[50:] += .5 40 | labels[50:] = 1 41 | confusion_matrix, names = milk.nfoldcrossvalidation(features, labels) 42 | print('Accuracy:', confusion_matrix.trace()/float(confusion_matrix.sum())) 43 | 44 | If you want to use a classifier, you instantiate a *learner object* and call its 45 | ``train()`` method:: 46 | 47 | import numpy as np 48 | import milk 49 | features = np.random.rand(100,10) 50 | labels = np.zeros(100) 51 | features[50:] += .5 52 | labels[50:] = 1 53 | learner = milk.defaultclassifier() 54 | model = learner.train(features, labels) 55 | 56 | # Now you can use the model on new examples: 57 | example = np.random.rand(10) 58 | print(model.apply(example)) 59 | example2 = np.random.rand(10) 60 | example2 += .5 61 | print(model.apply(example2)) 62 | 63 | There are several classification methods in the package, but they all use the 64 | same interface: ``train()`` returns a *model* object, which has an ``apply()`` 65 | method to execute on new instances. 66 | 67 | 68 | Details 69 | ------- 70 | License: MIT 71 | 72 | Author: Luis Pedro Coelho (with code from LibSVM and scikits.learn) 73 | 74 | API Documentation: `http://packages.python.org/milk/ <http://packages.python.org/milk/>`_ 75 | 76 | Mailing List: `http://groups.google.com/group/milk-users 77 | <http://groups.google.com/group/milk-users>`__ 78 | 79 | Features 80 | -------- 81 | - SVMs. Using the libsvm solver with a pythonesque wrapper around it. 82 | - LASSO 83 | - K-means using as little memory as possible. It can cluster millions of 84 | instances efficiently. 85 | - Random forests 86 | - Self organising maps 87 | - Stepwise Discriminant Analysis for feature selection. 88 | - Non-negative matrix factorisation 89 | - Affinity propagation 90 | 91 | Recent History 92 | -------------- 93 | 94 | The ChangeLog file contains a more complete history. 95 | 96 | New in 0.6.1 (11 May 2015) 97 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 98 | - Fixed source distribution 99 | 100 | New in 0.6 (27 Apr 2015) 101 | ~~~~~~~~~~~~~~~~~~~~~~~~ 102 | - Update for Python 3 103 | 104 | New in 0.5.3 (19 Jun 2013) 105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 106 | - Fix MDS for non-array inputs 107 | - Fix MDS bug 108 | - Add return_* arguments to kmeans 109 | - Extend zscore() to work on non-ndarrays 110 | - Add frac_precluster_learner 111 | - Work with older C++ compilers 112 | 113 | 114 | New in 0.5.2 (7 Mar 2013) 115 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 116 | - Fix distribution of Eigen with source 117 | 118 | New in 0.5.1 (11 Jan 2013) 119 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 120 | - Add subspace projection kNN 121 | - Export ``pdist`` in milk namespace 122 | - Add Eigen to source distribution 123 | - Add measures.curves.roc 124 | - Add ``mds_dists`` function 125 | - Add ``verbose`` argument to milk.tests.run 126 | 127 | 128 | New in 0.5 (05 Nov 2012) 129 | ~~~~~~~~~~~~~~~~~~~~~~~~ 130 | - Add coordinate-descent based LASSO 131 | - Add unsupervised.center function 132 | - Make zscore work with NaNs (by ignoring them) 133 | - Propagate apply_many calls through transformers 134 | - Much faster SVM classification, which means a much faster defaultlearner() 135 | [measured 2.5x speedup on yeast dataset!]
136 | 137 | 138 | For older versions, see the ``ChangeLog`` file -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | 9 | # Internal variables. 10 | PAPEROPT_a4 = -D latex_paper_size=a4 11 | PAPEROPT_letter = -D latex_paper_size=letter 12 | ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 13 | 14 | .PHONY: help clean html web pickle htmlhelp latex changes linkcheck 15 | 16 | help: 17 | @echo "Please use \`make <target>' where <target> is one of" 18 | @echo " html to make standalone HTML files" 19 | @echo " pickle to make pickle files" 20 | @echo " json to make JSON files" 21 | @echo " htmlhelp to make HTML files and a HTML help project" 22 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 23 | @echo " changes to make an overview over all changed/added/deprecated items" 24 | @echo " linkcheck to check all external links for integrity" 25 | 26 | clean: 27 | -rm -rf build/* 28 | 29 | html: 30 | mkdir -p build/html build/doctrees 31 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html 32 | @echo 33 | @echo "Build finished. The HTML pages are in build/html." 34 | 35 | pickle: 36 | mkdir -p build/pickle build/doctrees 37 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle 38 | @echo 39 | @echo "Build finished; now you can process the pickle files." 40 | 41 | web: pickle 42 | 43 | json: 44 | mkdir -p build/json build/doctrees 45 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) build/json 46 | @echo 47 | @echo "Build finished; now you can process the JSON files." 48 | 49 | htmlhelp: 50 | mkdir -p build/htmlhelp build/doctrees 51 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp 52 | @echo 53 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 54 | ".hhp project file in build/htmlhelp." 55 | 56 | latex: 57 | mkdir -p build/latex build/doctrees 58 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex 59 | @echo 60 | @echo "Build finished; the LaTeX files are in build/latex." 61 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 62 | "run these through (pdf)latex." 63 | 64 | changes: 65 | mkdir -p build/changes build/doctrees 66 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes 67 | @echo 68 | @echo "The overview file is in build/changes." 69 | 70 | linkcheck: 71 | mkdir -p build/linkcheck build/doctrees 72 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck 73 | @echo 74 | @echo "Link check complete; look for any errors in the above output " \ 75 | "or in build/linkcheck/output.txt." 76 | -------------------------------------------------------------------------------- /docs/source/adaboost.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | AdaBoost 3 | ======== 4 | 5 | Adaboost 6 | -------- 7 | 8 | This example is available as part of milk as ``milk/demos/adaboost.py``. 9 | 10 | Adaboost is based on a weak learner. For this example, we are going to use a 11 | stump learner:: 12 | 13 | import milk.supervised.tree 14 | import milk.supervised.adaboost 15 | 16 | weak = milk.supervised.tree.stump_learner() 17 | learner = milk.supervised.adaboost.boost_learner(weak) 18 | 19 | Currently, only binary classification is implemented for ``boost_learner``.
20 | Therefore, we need to use a converter, in this case, using the *one versus one* 21 | strategy:: 22 | 23 | import milk.supervised.multi 24 | learner = milk.supervised.multi.one_against_one(learner) 25 | 26 | Now, we can use this learner as we would normally do. For example, for 27 | cross-validation:: 28 | 29 | from milksets import wine 30 | features, labels = wine.load() 31 | cmat,names,predictions = \ 32 | milk.nfoldcrossvalidation(features, \ 33 | labels, \ 34 | classifier=learner, \ 35 | return_predictions=True) 36 | 37 | We display just the first two dimensions here using circles for correct 38 | predictions and crosses for mis-matches. The colour represents the underlying 39 | class:: 40 | 41 | import pylab as plt 42 | colors = "rgb" 43 | codes = "xo" 44 | for y,x,r,p in zip(features.T[0], features.T[1], labels, predictions): 45 | code = codes[int(r == p)] 46 | plt.plot([y],[x], colors[p]+code) 47 | plt.show() 48 | 49 | .. plot:: ../../milk/demos/adaboost.py 50 | :include-source: 51 | 52 | API Documentation 53 | ----------------- 54 | 55 | .. automodule:: milk.supervised.adaboost 56 | :members: boost_learner 57 | 58 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API Documentation 3 | ================= 4 | 5 | .. automodule:: milk 6 | :members: 7 | 8 | .. automodule:: milk.supervised 9 | :members: 10 | 11 | .. automodule:: milk.unsupervised 12 | :members: 13 | 14 | .. automodule:: milk.measures 15 | :members: 16 | 17 | .. automodule:: milk.utils 18 | :members: 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Benchmarks 3 | ========== 4 | 5 | Scikits.learn benchmark 6 | ----------------------- 7 | 8 | This is from a benchmark developed by the `scikits.learn team 9 | `__. I ran it on my Intel Core2 10 | 6600, 2.40GHz CPU. 11 | 12 | .. table:: Results in scikits.learn ml-benchmarks 13 | 14 | ============ ======= ====== ======= ======== ============= ======== 15 | Benchmark PyMVPA Shogun Pybrain MLPy scikits.learn milk 16 | ============ ======= ====== ======= ======== ============= ======== 17 | knn **1.0** 2.23 -- 2.23 3.05 2.20 18 | elasticnet -- -- -- 174.43 **1.0** -- 19 | lassolars -- -- -- 61.67 **1.0** -- 20 | pca -- -- -- -- **1.0** 11.11 21 | kmeans -- 2.02 7057.02 1.61 6.74 **1.0** 22 | svm 3.35 1.20 -- -- 1.24 **1.0** 23 | ============ ======= ====== ======= ======== ============= ======== 24 | 25 | 26 | All of the results are normalised by the fastest system for each entry (which 27 | is therefore, by definition, 1.0). 28 | 29 | So, except for PCA, milk *is pretty fast* and for kmeans and SVM learning it is 30 | the fastest system. 31 | 32 | Limitations of This Benchmark 33 | ----------------------------- 34 | 35 | 1. It is a very small dataset, so you do not get a feeling of how it scales. 36 | 2. It is only one dataset. 37 | 3. Since the benchmark came out, I made some changes to milk to make it go 38 | faster. I hope that other systems do the same, though, so we can have good progress.
40 | 41 | -------------------------------------------------------------------------------- /docs/source/clustering.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Clustering 3 | ========== 4 | 5 | K-Means 6 | ------- 7 | 8 | K-means is one of the simplest, but often most effective, clustering 9 | algorithms. milk supports k-means through the ``milk.kmeans`` function: 10 | 11 | :: 12 | 13 | features = np.random.randn(100,20) 14 | features[:50] *= 2 15 | 16 | k = 2 17 | cluster_ids, centroids = milk.kmeans(features, k) 18 | 19 | The milk implementation is very fast and can handle large amounts of data. In 20 | an effort to make it scale to millions of data points, the author of milk even 21 | included new features in numpy. If you happen to run numpy 1.6 or newer, then 22 | milk will pick it up and run faster with less memory. 23 | 24 | Milk has been used to cluster datasets with over 5 million data points and over 25 | 100 features per data point. You need enough RAM to handle the data matrix and 26 | the distance matrix (NxK) and a little extra, but milk is very careful not to 27 | allocate any more memory than it needs. 28 | 29 | 30 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | randomforests 9 | 10 | -------------------------------------------------------------------------------- /docs/source/extensions.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Extensions 3 | ========== 4 | 5 | An extension adds some functionality that is either not really core or requires 6 | extra dependencies. Currently, the only extension is a little `jug 7 | `__ based function for *parallel 8 | nfoldcrossvalidation*. 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/featureselection.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Feature Normalisation and Selection 3 | =================================== 4 | 5 | For many problems, feature normalisation and selection is an important step in 6 | the processing pipeline. 7 | 8 | Simple Normalisations 9 | --------------------- 10 | 11 | Fill in ``NaNs`` and ``Infs``: the ``checkfinite()`` learner does this. This 12 | learner does not use any of its input features: it always returns the same 13 | model. 14 | 15 | Whiten 16 | ------ 17 | 18 | Check out the function ``zscore()`` if you have a feature matrix, or the 19 | ``zscore_normalise()`` learner. 20 | 21 | Stepwise Discriminant Analysis 22 | ------------------------------ 23 | 24 | Stepwise Discriminant Analysis (SDA) is a simple feature selection method. It 25 | is supervised and independent of the downstream classifier. 26 | 27 | **Important Note**: SDA does not work well if your features are linearly 28 | dependent. Filter out linearly dependent features before calling SDA (use 29 | ``linearly_dependent_features``). 30 | 31 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: readme.rst 2 | 3 | Contents: 4 | 5 | ..
toctree:: 6 | :maxdepth: 2 7 | 8 | supervised 9 | nfoldcrossvalidation 10 | featureselection 11 | adaboost 12 | clustering 13 | randomnumbers 14 | examples 15 | parallel 16 | extensions 17 | benchmarks 18 | milksets 19 | api 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`search` 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/source/milksets.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Milksets 3 | ======== 4 | 5 | Milksets is a separate package that contains a few UCI datasets in a format 6 | that is easy to handle with milk. 7 | 8 | It is mostly useful for testing and playing around. 9 | 10 | 11 | You can install it from pypi with:: 12 | 13 | pip install milksets 14 | 15 | or:: 16 | 17 | easy_install milksets 18 | 19 | Links 20 | ----- 21 | 22 | - `github `__ 23 | - `homepage `__ 24 | 25 | -------------------------------------------------------------------------------- /docs/source/nfoldcrossvalidation.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Cross-validation 3 | ================ 4 | 5 | Cross validation is one of the better ways to evaluate the performance of 6 | supervised classification. 7 | 8 | Cross validation consists of separating the data into folds (hence the name 9 | _n_-fold cross-validation, where _n_ is a positive integer). For the purpose of 10 | this discussion, we consider 10 folds. In the first round, we leave the first 11 | fold out. This means we train on the other 9 folds and then evaluate the model 12 | on this left-out fold. On the second round, we leave the second fold out. This 13 | continues until every fold has been left out exactly once. 14 | 15 | Milk supports what is often explicitly called *stratified cross validation*, 16 | which means that it takes the class distributions into account (so that, in 10 17 | fold cross validation, each fold will have 10% of each class per round). 18 | 19 | An additional functionality, not normally found in machine learning packages or 20 | in machine learning theory, but very useful in practice, is the use of the 21 | ``origins`` parameter. Every datapoint can have an associated *origin*. This is 22 | an integer and its meaning is the following: all examples with the same 23 | origin will be in the same fold (so testing will never be performed where there 24 | was an object of the same origin used for training). 25 | 26 | This can model cases such as the following: you have collected patient data, 27 | which includes both some health measurement and an outcome of interest (for 28 | example, how the patient was doing a year after the initial exam). You wish to 29 | evaluate a supervised classification algorithm for predicting outcomes. In 30 | particular, you wish for an estimate of how well the system would perform on 31 | patients in any location (you know that the data collection has some site 32 | effects, perhaps because each person runs the test a little bit differently). 33 | Fortunately, you have the data to test this: the patients come from several 34 | clinics. Now, you set each patient's origin to be the ID of the clinic and 35 | evaluate the per-patient accuracy. 36 | 37 | 38 | API Documentation 39 | ----------------- 40 | 41 | ..
automodule:: milk.measures.nfoldcrossvalidation 42 | :members: 43 | 44 | -------------------------------------------------------------------------------- /docs/source/parallel.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Parallel Processing 3 | =================== 4 | 5 | .. versionadded:: 0.3.10 6 | Jug integration was added in version 0.3.10. Parallel processing was added 7 | with 0.4.0 8 | 9 | There is certain functionality in milk which is *embarrassingly parallel* (or 10 | almost so). Therefore, milk has some support for using multiprocessors and 11 | computing clusters. 12 | 13 | Jug Integration 14 | --------------- 15 | 16 | .. versionadded:: 0.3.10 17 | Jug integration requires `jug `__ 18 | 19 | Currently, there is support for running n-fold crossvalidation as multiple jug 20 | tasks, which jug can then partition across multiple processors (or computers in 21 | a cluster). 22 | 23 | Example 24 | ~~~~~~~ 25 | 26 | :: 27 | from milk.ext.jugparallel import nfoldcrossvalidation 28 | 29 | # For this example, we rely on milksets 30 | from milksets.wine import load 31 | 32 | # Load the data 33 | features, labels = load() 34 | 35 | cmatrix = nfoldcrossvalidation(features, labels) 36 | 37 | 38 | Save this as ``example.py`` and, now, you can run ``jug execute example.py`` to 39 | perform 10-fold cross-validation. Each fold will be its own Task and can be run 40 | independently of the others. 41 | 42 | Multiprocessing 43 | --------------- 44 | 45 | .. versionadded:: 0.4 46 | 47 | There are some opportunities for parallel processing which are hard to fit into 48 | the Jug framework (which is limited to coarse grained parallelisation). For 49 | example, choosing the parameters of a learner (e.g., the SVM learner) through 50 | cross-validation has a high degree of parallelisation, but is hard to fit into 51 | the jug framework without (1) restructuring the code and (2) doing unnecessary 52 | computation. 53 | 54 | Therefore, milk can use multiple processes for this operation, using the Python 55 | ``multiprocessing`` module. 56 | 57 | Currently, by default, *this functionality is disabled.* Change the value of 58 | ``milk.utils.parallel.max_procs`` to enable it. 59 | 60 | Over time, more functionality will take advantage of multiple cores. 61 | 62 | Example 63 | ~~~~~~~ 64 | 65 | This is a simple example, which relies on `milksets 66 | `__ just for convenience (you could use 67 | any other labeled feature set). 68 | 69 | As you can see, you do not have to do anything except call 70 | ``milk.utils.parallel.set_max_processors()`` to enable multiprocessing (calling it 71 | without an argument sets the number of processes to the number of CPUs).
72 | 73 | :: 74 | 75 | import numpy as np 76 | import milk 77 | 78 | # Import the parallel module 79 | from milk.utils import parallel 80 | 81 | # For this example, we rely on milksets 82 | from milksets.wine import load 83 | 84 | # Use all available processors 85 | parallel.set_max_processors() 86 | 87 | # Load the data 88 | features, labels = load() 89 | learner = milk.defaultlearner() 90 | model = learner.train(features[::2], labels[::2]) 91 | held_out = list(map(model.apply, features[1::2])) 92 | print(np.mean(labels[1::2] == held_out)) 93 | 94 | 95 | Naturally, you can combine both of these features:: 96 | 97 | from milk.ext.jugparallel import nfoldcrossvalidation 98 | # Import the parallel module 99 | from milk.utils import parallel 100 | 101 | # For this example, we rely on milksets 102 | from milksets.wine import load 103 | 104 | # Use all available processors 105 | parallel.set_max_processors() 106 | 107 | # Load the data 108 | features, labels = load() 109 | 110 | cmatrix = nfoldcrossvalidation(features, labels) 111 | 112 | This is now a jug script which uses all available processors. This is ideal if 113 | you have a cluster of machines with multiple cores per machine. You can run 114 | different folds on different machines and, internally, each fold will use all 115 | the cores on its machine. 116 | 117 | Naturally, if you run multiple folds on the same machine, they will end up 118 | fighting for the same cores and you will get no speedup. 119 | 120 | -------------------------------------------------------------------------------- /docs/source/principles.rst: -------------------------------------------------------------------------------- 1 | Principles of Milk 2 | ------------------- 3 | 4 | Play Well With Others 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | This is the basic principle of milk: it should play well with others. It means 8 | that its interfaces should, as much as possible, be flexible. 9 | 10 | Be Liberal With What You Accept. Be Conservative With What You Produce 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | 13 | Don't be fussy about input parameters, but specify your outputs very carefully. 14 | 15 | Work Interactively 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | This means that building a classifier should look like this:: 19 | 20 | classifier = milk.default_classifier(data,labels) 21 | 22 | and not like this:: 23 | 24 | classifier = milk.concattransforms( 25 | milk.chkfinite(), 26 | milk.to_interval(1,-1), 27 | milk.pick_best(f=0.10), 28 | milk.binary_to_multi(mode='1-vs-1', 29 | base=milk.supervised.gridsearch( 30 | base=milk.svm_binary(base=milk.svm_libsvm()), 31 | params={ 32 | 'C' : [2**c for c in xrange(-7,4)], 33 | 'kernel' : [milk.rbf_kernel(2**w) for w in xrange(-4,2)]))) 34 | container = milk.container() 35 | for col in len(data[0]): 36 | container.set_column(col,milk.CONTINUOUS) 37 | container.set_data(data) 38 | labelcontainer = milk.labelcontainer() 39 | labelcontainer.set_type(milk.STRING) 40 | labelcontainer.set_data(labels) 41 | 42 | classifier.train(container,labelcontainer) 43 | 44 | This often means that one might have a more complete interface internally and 45 | another interface for interactive use on top (see Matplotlib_ for a good 46 | example of this). 47 | 48 | .. _Matplotlib: http://matplotlib.sourceforge.net/ 49 | 50 | 51 | Don't Impose Yourself 52 | ~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | Don't assume that people are writing their software around your library, which 55 | translates into: 56 | 57 | * Don't impose your file format.
58 | * Don't impose your in-memory data format. 59 | 60 | Be Pythonic 61 | ~~~~~~~~~~~ 62 | 63 | In general, be a true Python library (and not just a wrapper around something 64 | else). For example: 65 | 66 | - If an SVM classifier takes a kernel as a parameter, then it should accept any 67 | 2-argument Python function (in fact, anything that's callable in Python). 68 | - Objects (like classifiers) should be pickle-able. 69 | 70 | You Don't Pay For What You Don't Use 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 72 | 73 | Flexibility should come with the lowest-possible cost. If a cost is 74 | unavoidable, it should be paid by those who use the flexibility and not by 75 | everybody else. 76 | 77 | -------------------------------------------------------------------------------- /docs/source/randomforests.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Using Random Forests 3 | ==================== 4 | 5 | If you are not familiar with random forests, in general, `Wikipedia 6 | `__ is a good place to start 7 | reading. The current article deals only with how to use them in **milk**. 8 | 9 | Random forests as implemented in milk are *binary classifiers*, so you need to 10 | use a transformer to turn them into multi-class learners if you have 11 | multi-class data. 12 | 13 | :: 14 | 15 | from milk.supervised import randomforest 16 | from milk.supervised.multi import one_against_one 17 | 18 | rf_learner = randomforest.rf_learner() 19 | learner = one_against_one(rf_learner) 20 | 21 | This is just another learner type, which we can use to train a model:: 22 | 23 | from milksets import wine 24 | features, labels = wine.load() 25 | model = learner.train(features, labels) 26 | 27 | or to perform cross-validation:: 28 | 29 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 30 | 31 | If you have `milksets `__ installed, you can try it on one of its datasets:: 32 | 33 | from milksets import wine 34 | features, labels = wine.load() 35 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 36 | 37 | We can finally plot the results (mapped to 2 dimensions using PCA): 38 | 39 | .. plot:: ../../milk/demos/rf_wine_2d.py 40 | :include-source: 41 | 42 | Colours indicate the classification output. A circle means that it matches the 43 | underlying label, a cross that it was a mis-classification. 44 | 45 | -------------------------------------------------------------------------------- /docs/source/randomnumbers.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Random Numbers 3 | ============== 4 | How milk handles random numbers 5 | ------------------------------- 6 | 7 | Many algorithms (e.g., `kmeans`) require random number initialisation. 8 | 9 | In `milk`, all functions that internally use random numbers take an `R` 10 | parameter. If left unspecified (or set to `None`), then it means that the 11 | internal initialisation should be used. 12 | 13 | `R` can be specified by an integer, a `random.Random` instance, or a 14 | `numpy.RandomState` instance. If the same `R` is passed twice to the function, 15 | then the results are deterministic. 16 | 17 | Functions that use random numbers 18 | --------------------------------- 19 | 20 | - `kmeans`: for initial cluster choice. 21 | - `repeated_kmeans`: for use in `kmeans` internally. 22 | - `som`: for initial choice of points. 
23 | - `nnmf` and `sparse_nnmf`: for initialisation. 24 | 25 | ``random`` and ``numpy.random`` 26 | ------------------------------- 27 | 28 | There are two randomness mechanisms used internally by `milk`: `random` (the 29 | standard Python package) and `numpy.random`. Setting the seed on just one of 30 | them will not be enough. You need to set *both*. This is an alternative to 31 | using the `R` technique outlined above. 32 | 33 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | MILK: MACHINE LEARNING TOOLKIT 3 | ============================== 4 | Machine Learning in Python 5 | -------------------------- 6 | 7 | Milk is a machine learning toolkit in Python. 8 | 9 | Its focus is on supervised classification with several classifiers available: 10 | SVMs (based on libsvm), k-NN, random forests, decision trees. It also performs 11 | feature selection. These classifiers can be combined in many ways to form 12 | different classification systems. 13 | 14 | For unsupervised learning, milk supports k-means clustering and affinity 15 | propagation. 16 | 17 | Milk is flexible about its inputs. It is optimised for numpy arrays, but can often 18 | handle anything (for example, for SVMs, you can use any datatype and any kernel 19 | and it does the right thing). 20 | 21 | There is a strong emphasis on speed and low memory usage. Therefore, most of 22 | the performance sensitive code is in C++. This sits behind Python-based 23 | interfaces for convenience. 24 | 25 | Features 26 | -------- 27 | - Random forests 28 | - Self organising maps 29 | - SVMs. Using the libsvm solver with a pythonesque wrapper around it. 30 | - Stepwise Discriminant Analysis for feature selection. 31 | - Non-negative matrix factorisation 32 | - K-means using as little memory as possible. 33 | - Affinity propagation 34 | 35 | License: MIT 36 | Author: Luis Pedro Coelho (with code from LibSVM and scikits.learn) 37 | Website: `http://luispedro.org/software/milk 38 | <http://luispedro.org/software/milk>`__ 39 | API Documentation: `http://packages.python.org/milk/ <http://packages.python.org/milk/>`_ -------------------------------------------------------------------------------- /docs/source/supervised.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Supervised Classification 3 | ========================= 4 | 5 | Supervised learning takes in both a set of *input features* and their 6 | corresponding *labels* to produce a model which can then be fed an unknown 7 | instance and produce a label for it. 8 | 9 | Typical supervised learning models are SVMs and decision trees. 10 | 11 | Example 12 | ------- 13 | :: 14 | 15 | features = np.random.randn(100,20) 16 | features[:50] *= 2 17 | labels = np.repeat((0,1), 50) 18 | 19 | classifier = milk.defaultclassifier() 20 | model = classifier.train(features, labels) 21 | new_label = model.apply(np.random.randn(20)) 22 | new_label2 = model.apply(np.random.randn(20)*2) 23 | 24 | Learners 25 | -------- 26 | 27 | All learners have a ``train`` function which takes at least 2 arguments: 28 | - features : sequence of features 29 | - labels : sequence of labels 30 | 31 | (They may take more parameters). 32 | 33 | They return a *model* object, which has an ``apply`` function which takes a 34 | single input and returns its label.
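A minimal sketch of this pattern, using the default learner on synthetic data (any other learner could be substituted for ``milk.defaultlearner()``)::

    import numpy as np
    import milk

    features = np.random.randn(100, 20)
    features[:50] *= 2
    labels = np.repeat((0, 1), 50)

    learner = milk.defaultlearner()
    model = learner.train(features, labels)  # returns a new, independent model
    print(model.apply(np.random.randn(20)))  # apply() classifies a single instance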
35 | 36 | Note that there are always two objects: the learner and the model, and they are 37 | independent. Every time you call ``learner.train()`` you get a new model. This 38 | is different from the typical interface where you first call ``train()`` and 39 | later ``apply()`` (or equivalent names) on the same object. This is a better 40 | interface because the type system protects you against calling ``apply()`` on 41 | the wrong object and because it is often the case that you want to learn several 42 | models with the same learner. The only disadvantage is that the word 43 | *classifier* can be used for both, so in the documentation, we always refer to 44 | *learners* and *models.* 45 | 46 | Both learners and models are pickle()able. 47 | 48 | Composition and Defaults 49 | ------------------------ 50 | 51 | The style of milk involves many small objects, each providing one step of the 52 | pipeline. For example: 53 | 54 | 1. remove NaNs and Infs from features 55 | 2. bring features to the [-1, 1] interval 56 | 3. feature selection by removing linearly dependent features and then SDA 57 | 4. one-vs-one classifier based on a grid search for the parameters of an SVM 58 | classifier 59 | 60 | To get this you can use:: 61 | 62 | classifier = ctransforms( 63 | chkfinite(), 64 | interval_normalise(), 65 | featureselector(linear_independent_features), 66 | sda_filter(), 67 | gridsearch(one_against_one(svm.svm_to_binary(svm.svm_raw())), 68 | params={ 69 | 'C': 2.**np.arange(-9,5), 70 | 'kernel': [svm.rbf_kernel(2.**i) for i in np.arange(-7,4)], 71 | } 72 | )) 73 | 74 | As you can see, this is very flexible, but can be tedious. Therefore, milk 75 | provides the above as a single function call: ``defaultclassifier()`` 76 | 77 | 78 | supervised Submodules 79 | --------------------- 80 | 81 | - defaultclassifier: contains a default "good enough" classifier 82 | - svm: related to SVMs 83 | - adaboost: Adaboost 84 | - randomforest: random forests 85 | - grouped: contains objects to transform single object learners into group 86 | learners by voting 87 | - multi: transforms binary learners into multi-class learners (1-vs-1 or 88 | 1-vs-rest) 89 | - featureselection: feature selection 90 | - knn: k-nearest neighbours 91 | - tree: decision tree learners 92 | 93 | -------------------------------------------------------------------------------- /get-eigen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget http://bitbucket.org/eigen/eigen/get/3.1.2.tar.bz2 3 | tar xjf 3.1.2.tar.bz2 4 | cd eigen-eigen-5097c01bcdc4 5 | mkdir -p ../milk/supervised/eigen3 6 | cp -r Eigen ../milk/supervised/eigen3 7 | 8 | -------------------------------------------------------------------------------- /milk/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall
be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | ''' 24 | Milk 25 | 26 | Machine learning in Python 27 | 28 | Toplevel functions 29 | ------------------ 30 | - nfoldcrossvalidation: n-fold crossvalidation 31 | - defaultclassifier: get a general purpose classifier 32 | - kmeans: kmeans clustering 33 | 34 | Modules 35 | ------- 36 | - supervised 37 | - unsupervised 38 | - measures 39 | 40 | Example 41 | ------- 42 | 43 | :: 44 | 45 | features = np.random.randn(100,20) 46 | features[:50] *= 2 47 | labels = np.repeat((0,1), 50) 48 | 49 | classifier = milk.defaultclassifier() 50 | model = classifier.train(features, labels) 51 | new_label = model.apply(np.random.randn(20)) 52 | new_label2 = model.apply(np.random.randn(20)*2) 53 | 54 | ''' 55 | 56 | try: 57 | from .nfoldcrossvalidation import nfoldcrossvalidation 58 | from .supervised.defaultclassifier import defaultclassifier 59 | from .supervised.defaultlearner import defaultlearner 60 | from .unsupervised.kmeans import kmeans 61 | from .unsupervised import pdist, zscore, pca 62 | from .milk_version import __version__ 63 | except ImportError as e: 64 | import sys 65 | sys.stderr.write('''\ 66 | Could not import submodules (exact error was: {}). 67 | 68 | There are many reasons for this error; the most common one is that you have 69 | either not built the packages, or have built (using `python setup.py build`) or 70 | installed them (using `python setup.py install`) and then proceeded to test 71 | milk **without changing the current directory**. 72 | 73 | Try installing and then changing to another directory before importing milk. 74 | '''.format(e)) 75 | 76 | __all__ = [ 77 | '__version__', 78 | 'kmeans', 79 | 'pdist', 80 | 'zscore', 81 | 'defaultclassifier', 82 | 'defaultlearner', 83 | 'nfoldcrossvalidation', 84 | ] 85 | -------------------------------------------------------------------------------- /milk/active/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/active/__init__.py -------------------------------------------------------------------------------- /milk/active/eimpact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software.
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | from __future__ import division 23 | import numpy 24 | from ..supervised import svm 25 | from ..supervised.classifier import ctransforms 26 | 27 | 28 | def expected_impacts(D,labels,U): 29 | ''' 30 | EIs = expected_impacts(D,labels,U) 31 | 32 | Compute Expected impact for each element of U 33 | 34 | EIs[i]: P(label[i] == 1) * IMPACT(label[i] == 1) + P(label[i] == 0) * IMPACT(label[i] == 0) 35 | ''' 36 | assert len(D) == len(labels), 'Number of labeled examples should match length of labels vector' 37 | 38 | K = svm.rbf_kernel(20000) 39 | prob_classifier = ctransforms(svm.svm_raw(kernel=K,C=4),svm.svm_sigmoidal_correction()) 40 | label_classifier = ctransforms(svm.svm_raw(kernel=K,C=4),svm.svm_binary()) 41 | 42 | prob_classifier.train(D,labels) 43 | u_probs = prob_classifier(U) 44 | u_labels = (u_probs > .5) 45 | impacts = [] 46 | for u,p in zip(U,u_probs): 47 | print(len(impacts)) 48 | label_classifier.train(numpy.vstack((D,u)),numpy.hstack((labels,[0]))) 49 | u_labels_0 = label_classifier(U) 50 | 51 | label_classifier.train(numpy.vstack((D,u)),numpy.hstack((labels,[1]))) 52 | u_labels_1 = label_classifier(U) 53 | 54 | e_impact = (1.-p)*(u_labels != u_labels_0).sum() + p*(u_labels != u_labels_1).sum() 55 | 56 | impacts.append(e_impact) 57 | return impacts 58 | 59 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 60 | -------------------------------------------------------------------------------- /milk/active/uncertainty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | ''' 24 | Uncertainty 25 | ============ 26 | 27 | Implements uncertainty-based active learning strategies.
28 | 29 | These are strategies that are based on querying those elements in the pool 30 | which we are most uncertain about. 31 | 32 | Functions 33 | ---------- 34 | * entropy 35 | * one_minus_max 36 | ''' 37 | 38 | from __future__ import division 39 | import numpy as np 40 | 41 | def entropy(model, pool): 42 | ''' 43 | entropies = entropy(model, pool) 44 | 45 | Returns the entropy of each classification output for 46 | members in the pool. 47 | ''' 48 | def _entropy(ps): 49 | # ps is the sequence of class probabilities returned by model.apply() 50 | H = 0. 51 | for p in ps: 52 | if p > 1e-9: 53 | H -= p * np.log(p) 54 | return H 55 | return [_entropy(model.apply(u)) for u in pool] 56 | 57 | def one_minus_max(model,pool): 58 | ''' 59 | oneminus = one_minus_max(model,pool) 60 | 61 | oneminus[i] = 1 - max_L { P(pool_i == L) } 62 | 63 | Returns one minus the probability for the best label guess. 64 | ''' 65 | def _minus1(ps): 66 | return 1. - np.max(ps) 67 | return [_minus1(model.apply(u)) for u in pool] 68 | 69 | 70 | -------------------------------------------------------------------------------- /milk/demos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/demos/__init__.py -------------------------------------------------------------------------------- /milk/demos/adaboost.py: -------------------------------------------------------------------------------- 1 | import pylab as plt 2 | import milk.supervised.tree 3 | import milk.supervised.adaboost 4 | from milksets import wine 5 | import milk.supervised.multi 6 | 7 | weak = milk.supervised.tree.stump_learner() 8 | learner = milk.supervised.adaboost.boost_learner(weak) 9 | learner = milk.supervised.multi.one_against_one(learner) 10 | 11 | features, labels = wine.load() 12 | cmat,names,predictions = milk.nfoldcrossvalidation(features,labels, classifier=learner, return_predictions=True) 13 | colors = "rgb" 14 | codes = "xo" 15 | for y,x,r,p in zip(features.T[0], features.T[1], labels, predictions): 16 | code = codes[int(r == p)] 17 | plt.plot([y],[x], colors[p]+code) 18 | plt.show() 19 | 20 | -------------------------------------------------------------------------------- /milk/demos/rf_wine_2d.py: -------------------------------------------------------------------------------- 1 | from milk.supervised import randomforest 2 | from milk.supervised.multi import one_against_one 3 | import milk.nfoldcrossvalidation 4 | import milk.unsupervised 5 | 6 | import pylab 7 | from milksets import wine 8 | 9 | # Load 'wine' dataset 10 | features, labels = wine.load() 11 | # random forest learner 12 | rf_learner = randomforest.rf_learner() 13 | # rf is a binary learner, so we transform it into a multi-class classifier 14 | learner = one_against_one(rf_learner) 15 | 16 | # cross validate with this learner and return predictions on left-out elements 17 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 18 | 19 | print('cross-validation accuracy:', cmat.trace()/float(cmat.sum())) 20 | 21 | # dimensionality reduction for display 22 | x,v = milk.unsupervised.pca(features) 23 | colors = "rgb" # predicted colour 24 | marks = "xo" # whether the prediction was correct 25 | for (y,x),p,r in zip(x[:,:2], preds, labels): 26 | c = colors[p] 27 | m = marks[p == r] 28 | pylab.plot(y,x,c+m) 29 | pylab.show() 30 | 31 | -------------------------------------------------------------------------------- /milk/demos/svm-decision-boundary.py:
-------------------------------------------------------------------------------- 1 | from pylab import * 2 | import numpy as np 3 | 4 | from milksets.wine import load 5 | import milk.supervised 6 | import milk.unsupervised.pca 7 | import milk.supervised.svm 8 | 9 | features, labels = load() 10 | features = features[labels < 2] 11 | labels = labels[labels < 2] 12 | features,_ = milk.unsupervised.pca(features) 13 | features = features[:,:2] 14 | learner = milk.supervised.svm.svm_raw(kernel=np.dot, C=12) 15 | model = learner.train(features, labels) 16 | w = np.dot(model.svs.T, model.Yw) 17 | b = model.b 18 | x = np.linspace(-.5, .1, 100) 19 | y = -w[0]/w[1]*x + b/w[1] 20 | plot(features[labels == 1][:,0], features[labels == 1][:,1], 'bx') 21 | plot(features[labels == 0][:,0], features[labels == 0][:,1], 'ro') 22 | plot(x,y) 23 | savefig('svm-demo-points.pdf') 24 | 25 | clf() 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | learner = milk.supervised.svm.svm_raw(kernel=milk.supervised.svm.rbf_kernel(1.), C=12) 34 | model = learner.train(features, labels) 35 | Y, X = (np.mgrid[:101,:101]-50)/12.5 36 | values = [model.apply((y,x)) for y,x in zip(Y.ravel(),X.ravel())] 37 | values = np.array(values).reshape(Y.shape) 38 | sfeatures = features*12.5 39 | sfeatures += 50 40 | plot(sfeatures[labels == 0][:,0], sfeatures[labels == 0][:,1], 'bo') 41 | plot(sfeatures[labels == 1][:,0], sfeatures[labels == 1][:,1], 'ro') 42 | imshow(values.T) 43 | savefig('svm-demo-boundary.pdf') 44 | 45 | 46 | -------------------------------------------------------------------------------- /milk/ext/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | =============== 3 | Milk Extensions 4 | =============== 5 | 6 | These are modules whose functionality is not really part of the core 7 | functionality of milk, but which are useful with it. 8 | ''' 9 | 10 | -------------------------------------------------------------------------------- /milk/measures/__init__.py: -------------------------------------------------------------------------------- 1 | from .measures import accuracy, waccuracy, zero_one_loss, confusion_matrix, bayesian_significance 2 | -------------------------------------------------------------------------------- /milk/measures/cluster_agreement.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | def rand_arand_jaccard(recovered, labels): 10 | ''' 11 | rand, a_rand, jaccard = rand_arand_jaccard(recovered, labels) 12 | 13 | Compute Rand, Adjusted Rand, and Jaccard indices 14 | 15 | These share most of the computation. Therefore, it is best to compute them 16 | together even if you are only going to use some. 
17 | 18 | Parameters 19 | ---------- 20 | recovered : sequence of int 21 | The recovered clusters 22 | labels : sequence of int 23 | Underlying labels 24 | 25 | Returns 26 | ------- 27 | rand : float 28 | Rand index 29 | a_rand : float 30 | Adjusted Rand index 31 | jaccard : float 32 | Jaccard index 33 | 34 | References 35 | ---------- 36 | http://en.wikipedia.org/wiki/Rand_index 37 | http://en.wikipedia.org/wiki/Jaccard_index 38 | ''' 39 | 40 | from scipy.misc import comb 41 | recovered = np.asanyarray(recovered) 42 | labels = np.asanyarray(labels) 43 | contig,_,_ = np.histogram2d(recovered, labels,np.arange(max(recovered.max()+2,labels.max()+2))) 44 | A_0 = contig.sum(0) 45 | A_1 = contig.sum(1) 46 | Ai2 = np.sum(A_0*(A_0-1)/2.) 47 | Bi2 = np.sum(A_1*(A_1-1)/2.) 48 | n = A_0.sum() 49 | 50 | a = comb(contig.ravel(), 2).sum() 51 | b = comb(A_0, 2).sum()-a 52 | c = comb(A_1, 2).sum()-a 53 | d = comb(n, 2)-a-b-c 54 | rand = (a+d)/(a+b+c+d) 55 | jaccard = a/(a+b+c) # pairs together in both clusterings over pairs together in at least one 56 | 57 | index = np.sum(contig*(contig-1)/2) 58 | expected = Ai2*Bi2/n/(n-1)*2. 59 | maxindex = (Ai2+Bi2)/2. 60 | a_rand = (index-expected)/(maxindex-expected) 61 | 62 | return rand, a_rand, jaccard 63 | 64 | -------------------------------------------------------------------------------- /milk/measures/curves.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | def precision_recall(values, labels, mode='all', nr_steps=100): 10 | ''' 11 | precision, recall = precision_recall(values, labels, mode='all', nr_steps=100) 12 | plot(precision, recall) 13 | 14 | Compute a precision-recall curve. 15 | 16 | For a given threshold ``T``, consider that the positions where ``values >= 17 | T`` are classified as True. Precision is defined as ``TP/(TP+FP)``, while 18 | recall is defined as ``TP/(TP+FN)``. 19 | 20 | Parameters 21 | ---------- 22 | values : sequence of numbers 23 | labels : boolean sequence 24 | mode : str, optional 25 | Which thresholds to consider. Either 'all' (i.e., use all values of 26 | `values` as possible thresholds), or 'steps' (using `nr_steps` 27 | equidistant points from ``min(values)`` to ``max(values)``) 28 | nr_steps : integer, optional 29 | How many steps to use. Only meaningful if ``mode == 'steps'`` 30 | 31 | Returns 32 | ------- 33 | precision : a sequence of floats 34 | recall : a sequence of floats 35 | 36 | Actually, a ``2 x P`` array is returned.
37 |     '''
38 |
39 |     values = np.asanyarray(values)
40 |     labels = np.asanyarray(labels)
41 |     if len(values) != len(labels):
42 |         raise ValueError('milk.measures.precision_recall: `values` must be of same length as `labels`')
43 |     if mode == 'all':
44 |         points = list(set(values))
45 |         points.sort()
46 |     elif mode == 'steps':
47 |         points = np.linspace(values.min(), values.max(), nr_steps)
48 |     else:
49 |         raise ValueError('milk.measures.precision_recall: cannot handle mode: `%s`' % mode)
50 |     true_pos = float(np.sum(labels))
51 |     precision_recall = np.empty((len(points),2), float)
52 |
53 |     for i,p in enumerate(points):
54 |         selected = (values >= p)
55 |         selected = labels[selected]
56 |         precision_recall[i] = (np.mean(selected), np.sum(selected)/true_pos)
57 |     return precision_recall.T
58 |
59 | def roc(values, labels, mode='all', nr_steps=100):
60 |     '''
61 |     fpr, tpr = roc(values, labels, mode='all', nr_steps=100)
62 |     plot(fpr, tpr)
63 |
64 |     Compute a ROC curve
65 |
66 |     For a given threshold ``T``, consider that the positions where ``values >=
67 |     T`` are classified as True. The true positive rate is ``TP/(TP+FN)``,
68 |     while the false positive rate is ``FP/(FP+TN)``.
69 |
70 |     Parameters
71 |     ----------
72 |     values : sequence of numbers
73 |     labels : boolean sequence
74 |     mode : str, optional
75 |         Which thresholds to consider. Either 'all' (i.e., use all values of
76 |         `values` as possible thresholds), or 'steps' (using `nr_steps`
77 |         equidistant points from ``min(values)`` to ``max(values)``)
78 |     nr_steps : integer, optional
79 |         How many steps to use. Only meaningful if ``mode == 'steps'``
80 |
81 |     Returns
82 |     -------
83 |     fpr : a sequence of floats
84 |     tpr : a sequence of floats
85 |
86 |     The two sequences are returned stacked as a single ``2 x P`` array.
87 |     '''
88 |     values = np.asanyarray(values)
89 |     labels = np.asanyarray(labels)
90 |     if len(values) != len(labels):
91 |         raise ValueError('milk.measures.roc: `values` must be of same length as `labels`')
92 |     if mode == 'all':
93 |         points = list(set(values))
94 |         points.sort()
95 |     elif mode == 'steps':
96 |         points = np.linspace(values.min(), values.max(), nr_steps)
97 |     else:
98 |         raise ValueError('milk.measures.roc: cannot handle mode: `%s`' % mode)
99 |     roc = np.empty((len(points),2), float)
100 |     P = float(np.sum(labels))
101 |     N = len(labels)-P
102 |     for i,p in enumerate(reversed(points)):
103 |         selected = labels[values >= p]
104 |         roc[i] = (np.sum(~selected)/N, np.sum(selected)/P)
105 |     return roc.T
106 |
107 |
-------------------------------------------------------------------------------- /milk/milk_version.py: --------------------------------------------------------------------------------
1 | __version__ = '0.6.1'
2 |
-------------------------------------------------------------------------------- /milk/nfoldcrossvalidation.py: --------------------------------------------------------------------------------
1 | from .measures.nfoldcrossvalidation import foldgenerator, getfold, nfoldcrossvalidation
2 |
-------------------------------------------------------------------------------- /milk/supervised/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | '''
8 | milk.supervised
9 |
10 | This holds the supervised classification modules:
11 |
12 | Submodules
13 | ----------
14 |
15 | - defaultclassifier: contains a default "good enough" classifier
16 | - svm: related to SVMs
17 | - grouped: contains objects to transform single object classifiers into group classifiers
18 |   by voting
19 | - multi: transforms binary classifiers into multi-class classifiers (1-vs-1 or 1-vs-rest)
20 | - featureselection: feature selection
21 | - knn: k-nearest neighbours
22 | - tree: decision tree classifiers
23 |
24 | Classifiers
25 | -----------
26 |
27 | All classifiers have a `train` function which takes 2 arguments:
28 | - features : sequence of features
29 | - labels : sequence of labels
30 | They return a `model` object, which has an `apply` function which takes a
31 | single input and returns its label.
32 |
33 | Note that there are always two objects: the learner and the model, and they are
34 | independent. Every time you call learner.train() you get a new model.
35 |
36 | Both learners and models are pickle()able.
37 |
38 | Example
39 | -------
40 | ::
41 |
42 |     features = np.random.randn(100,20)
43 |     features[:50] *= 2
44 |     labels = np.repeat((0,1), 50)
45 |
46 |     classifier = milk.defaultclassifier()
47 |     model = classifier.train(features, labels)
48 |     new_label = model.apply(np.random.randn(20))
49 |     new_label2 = model.apply(np.random.randn(20)*2)
50 | '''
51 |
52 | from .defaultclassifier import defaultclassifier, svm_simple
53 | from .classifier import normaliselabels
54 | from .gridsearch import gridsearch
55 | from .tree import tree_learner
56 | from .lasso import lasso, lasso_learner, lasso_model_walk, lasso_walk
57 |
58 | __all__ = [
59 |     'normaliselabels',
60 |     'defaultclassifier',
61 |     'svm_simple',
62 |     'gridsearch',
63 |     'lasso',
64 |     'lasso_learner',
65 |     'lasso_model_walk',
66 |     'lasso_walk',
67 |     'tree_learner',
68 |     ]
69 |
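All of the learners exported above follow this train/apply protocol, which is also
what n-fold cross-validation consumes. A short sketch on synthetic data (assuming
the top-level re-exports ``milk.defaultlearner`` and ``milk.nfoldcrossvalidation``,
which the test files later in this listing also use)::

    import numpy as np
    import milk

    features = np.random.rand(120, 10)
    features[:60] += .8                  # separate the two classes
    labels = np.repeat((0, 1), 60)
    # cmatrix is a confusion matrix; names maps its rows back to label values
    cmatrix, names = milk.nfoldcrossvalidation(features, labels,
                                               learner=milk.defaultlearner())
    accuracy = cmatrix.trace() / cmatrix.sum()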
-------------------------------------------------------------------------------- /milk/supervised/_perceptron.cpp: --------------------------------------------------------------------------------
1 | // Copyright (C) 2011-2015, Luis Pedro Coelho
2 | // License: MIT
3 |
4 | #include <iostream>
5 | #include <memory>
6 | #include <cmath>
7 | #include <cassert>
8 | #include "../utils/utils.h"
9 | extern "C" {
10 |     #include <Python.h>
11 |     #include <numpy/ndarrayobject.h>
12 | }
13 |
14 |
15 | namespace {
16 |
17 | template <typename T>
18 | int perceptron(PyArrayObject* data_arr, const long* labels, PyArrayObject* weights_arr, double eta) {
19 |     const T* data = reinterpret_cast<const T*>(PyArray_DATA(data_arr));
20 |     T* weights = reinterpret_cast<T*>(PyArray_DATA(weights_arr));
21 |     const int N0 = PyArray_DIM(data_arr, 0);
22 |     const int N1 = PyArray_DIM(data_arr, 1);
23 |     int nr_errors = 0;
24 |     for (int i = 0; i != N0; ++i, data += N1, ++labels) {
25 |         T val = weights[0];
26 |         for (int j = 0; j != N1; ++j) {
27 |             val += weights[j+1] * data[j];
28 |         }
29 |         int ell = (val > 0);
30 |         if (ell != *labels) {
31 |             int pm = (*labels ? +1 : -1);
32 |             ++nr_errors;
33 |             T error = pm * eta * std::abs(pm-val);
34 |             weights[0] += error;
35 |             for (int j = 0; j != N1; ++j) {
36 |                 weights[j+1] += error*data[j];
37 |             }
38 |         }
39 |     }
40 |     return nr_errors;
41 | }
42 |
43 | PyObject* py_perceptron(PyObject* self, PyObject* args) {
44 |     const char* errmsg = "Arguments were not what was expected for perceptron.\n"
45 |                          "This is an internal function: Do not call directly unless you know exactly what you're doing.\n";
46 |     PyArrayObject* data;
47 |     PyArrayObject* labels;
48 |     PyArrayObject* weights;
49 |     double eta;
50 |     if (!PyArg_ParseTuple(args, "OOOd", &data, &labels, &weights, &eta)) {
51 |         PyErr_SetString(PyExc_RuntimeError,errmsg);
52 |         return 0;
53 |     }
54 |     if (!PyArray_Check(data) || !PyArray_ISCONTIGUOUS(data) ||
55 |         !PyArray_Check(weights) || !PyArray_ISCONTIGUOUS(weights) ||
56 |         !PyArray_Check(labels) || !PyArray_ISCONTIGUOUS(labels) || !PyArray_EquivTypenums(PyArray_TYPE(labels), NPY_LONG) ||
57 |         PyArray_TYPE(data) != PyArray_TYPE(weights)||
58 |         PyArray_NDIM(data) != 2 || PyArray_NDIM(weights) != 1 || PyArray_DIM(data,1) + 1 != PyArray_DIM(weights,0)) {
59 |         PyErr_SetString(PyExc_RuntimeError,errmsg);
60 |         return 0;
61 |     }
62 |     int nr_errors;
63 |     if (PyArray_TYPE(data) == NPY_FLOAT) {
64 |         nr_errors = perceptron<float>(data, reinterpret_cast<const long*>(PyArray_DATA(labels)), weights, eta);
65 |     } else if (PyArray_TYPE(data) == NPY_DOUBLE) {
66 |         nr_errors = perceptron<double>(data, reinterpret_cast<const long*>(PyArray_DATA(labels)), weights, eta);
67 |     } else {
68 |         PyErr_SetString(PyExc_RuntimeError, errmsg);
69 |         return 0;
70 |     }
71 |     return PyLong_FromLong(nr_errors);
72 | }
73 |
74 | PyMethodDef methods[] = {
75 |     {"perceptron", py_perceptron, METH_VARARGS , "Do NOT call directly.\n" },
76 |     {NULL, NULL,0,NULL},
77 | };
78 |
79 | const char * module_doc =
80 |     "Internal Module.\n"
81 |     "\n"
82 |     "Do NOT use directly!\n";
83 |
84 | } // namespace
85 |
86 | DECLARE_MODULE(_perceptron)
87 |
88 |
-------------------------------------------------------------------------------- /milk/supervised/adaboost.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | # License: MIT. See COPYING.MIT file in the milk distribution
5 |
6 | from __future__ import division
7 | import numpy as np
8 | from .normalise import normaliselabels
9 | from .base import supervised_model
10 |
11 | '''
12 | AdaBoost
13 |
14 | Simple implementation of Adaboost
15 |
16 | Learner
17 | -------
18 |
19 | boost_learner
20 |
21 | '''
22 |
23 | __all__ = [
24 |     'boost_learner',
25 |     ]
26 |
27 | def _adaboost(features, labels, base, max_iters):
28 |     m = len(features)
29 |     D = np.ones(m, dtype=float)
30 |     D /= m
31 |     Y = np.ones(len(labels), dtype=float)
32 |     names = np.array([-1, +1])
33 |     Y = names[labels]
34 |     H = []
35 |     A = []
36 |     for t in range(max_iters):
37 |         Ht = base.train(features, labels, weights=D)
38 |         train_out = np.array(list(map(Ht.apply, features)))
39 |         train_out = names[train_out.astype(int)]
40 |         Et = np.dot(D, (Y != train_out))
41 |         if Et > .5:
42 |             # early return
43 |             break
44 |         At = .5 * np.log((1. + Et) / (1.
- Et)) 45 | D *= np.exp((-At) * Y * train_out) 46 | D /= np.sum(D) 47 | A.append(At) 48 | H.append(Ht) 49 | return H, A 50 | 51 | 52 | class boost_model(supervised_model): 53 | def __init__(self, H, A, names): 54 | self.H = H 55 | self.A = A 56 | self.names = names 57 | 58 | def apply(self, f): 59 | v = sum((a*h.apply(f)) for h,a in zip(self.H, self.A)) 60 | v /= np.sum(self.A) 61 | return self.names[v > .5] 62 | 63 | 64 | class boost_learner(object): 65 | ''' 66 | learner = boost_learner(weak_learner_type(), max_iters=100) 67 | model = learner.train(features, labels) 68 | test = model.apply(f) 69 | 70 | AdaBoost learner 71 | 72 | Attributes 73 | ---------- 74 | base : learner 75 | Weak learner 76 | max_iters : integer 77 | Nr of iterations (default: 100) 78 | ''' 79 | def __init__(self, base, max_iters=100): 80 | self.base = base 81 | self.max_iters = max_iters 82 | 83 | def train(self, features, labels, normalisedlabels=False, names=(0,1), weights=None, **kwargs): 84 | if not normalisedlabels: 85 | labels,names = normaliselabels(labels) 86 | H,A = _adaboost(features, labels, self.base, self.max_iters) 87 | return boost_model(H, A, names) 88 | -------------------------------------------------------------------------------- /milk/supervised/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | 8 | class supervised_model(object): 9 | def apply_many(self, fs): 10 | ''' 11 | labels = model.apply_many( examples ) 12 | 13 | This is equivalent to ``map(model.apply, examples)`` but may be 14 | implemented in a faster way. 15 | 16 | Parameters 17 | ---------- 18 | examples : sequence of training examples 19 | 20 | Returns 21 | ------- 22 | labels : sequence of labels 23 | ''' 24 | return list(map(self.apply, fs)) 25 | 26 | 27 | class base_adaptor(object): 28 | def __init__(self, base): 29 | self.base = base 30 | 31 | def set_option(self, k, v): 32 | self.base.set_option(k, v) 33 | -------------------------------------------------------------------------------- /milk/supervised/classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | # THE SOFTWARE.
22 |
23 | from __future__ import division
24 | import numpy as np
25 | from .normalise import normaliselabels
26 | from .base import supervised_model
27 |
28 | __all__ = ['normaliselabels', 'ctransforms']
29 |
30 | class threshold_model(object):
31 |     '''
32 |     threshold_model
33 |
34 |     Attributes
35 |     ----------
36 |     threshold : float
37 |         threshold value
38 |     '''
39 |     def __init__(self, threshold=.5):
40 |         self.threshold = threshold
41 |
42 |     def apply(self, f):
43 |         return f >= self.threshold
44 |
45 |     def __repr__(self):
46 |         return 'threshold_model({})'.format(self.threshold)
47 |     __str__ = __repr__
48 |
49 | class fixed_threshold_learner(object):
50 |     def __init__(self, threshold=.5):
51 |         self.threshold = threshold
52 |     def train(self, features, labels, **kwargs):
53 |         return threshold_model(self.threshold)
54 |
55 |     def __repr__(self):
56 |         return 'fixed_threshold_learner({})'.format(self.threshold)
57 |     __str__ = __repr__
58 |
59 |
60 | class ctransforms_model(supervised_model):
61 |     '''
62 |     model = ctransforms_model(models)
63 |
64 |     A model that consists of a series of transformations.
65 |
66 |     See Also
67 |     --------
68 |     ctransforms
69 |     '''
70 |     def __init__(self, models):
71 |         self.models = models
72 |
73 |     def apply_many(self, features):
74 |         if len(features) == 0:
75 |             return features
76 |         for m in self.models:
77 |             features = m.apply_many(features)
78 |         return features
79 |
80 |     def __repr__(self):
81 |         return 'ctransforms_model({})'.format(self.models)
82 |     __str__ = __repr__
83 |
84 |     def __getitem__(self, ix):
85 |         return self.models[ix]
86 |
87 |     def apply(self,features):
88 |         for T in self.models:
89 |             features = T.apply(features)
90 |         return features
91 |
92 | class ctransforms(object):
93 |     '''
94 |     ctransf = ctransforms(c0, c1, c2, ...)
95 |
96 |     Concatenate transforms.
97 |     '''
98 |     def __init__(self,*args):
99 |         self.transforms = args
100 |
101 |
102 |     def train(self, features, labels, **kwargs):
103 |         models = []
104 |         model = None
105 |         for T in self.transforms:
106 |             if model is not None:
107 |                 features = np.array([model.apply(f) for f in features])
108 |             model = T.train(features, labels, **kwargs)
109 |             models.append(model)
110 |         return ctransforms_model(models)
111 |
112 |     def __repr__(self):
113 |         return 'ctransforms(*{})'.format(self.transforms)
114 |
115 |     __str__ = __repr__
116 |
117 |     def set_option(self, opt, val):
118 |         idx, opt = opt
119 |         self.transforms[idx].set_option(opt,val)
120 |
121 |
-------------------------------------------------------------------------------- /milk/supervised/defaultclassifier.py: --------------------------------------------------------------------------------
1 | from milk.supervised.defaultlearner import *
2 | defaultclassifier = defaultlearner
3 |
4 |
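``ctransforms`` above is the pipeline glue used throughout milk: each stage is
trained, its model transforms the features, and the next stage trains on the
output. An illustrative sketch chaining a z-score normaliser with an SVM
(``svm_raw``/``svm_to_binary`` as used in the test files later in this listing;
data is synthetic)::

    import numpy as np
    import milk.supervised.normalise
    import milk.supervised.svm
    from milk.supervised.classifier import ctransforms

    # stage 1 learns the normalisation; stage 2 learns the classifier
    pipeline = ctransforms(
        milk.supervised.normalise.zscore_normalise(),
        milk.supervised.svm.svm_to_binary(
            milk.supervised.svm.svm_raw(kernel=np.dot, C=1.)),
        )
    features = np.random.randn(40, 5)
    features[:20] += 1.5
    labels = np.repeat((0, 1), 20)
    model = pipeline.train(features, labels)
    model.apply(features[0])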
-------------------------------------------------------------------------------- /milk/supervised/grouped.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2010-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 | # -*- coding: utf-8 -*-
7 |
8 | from __future__ import division
9 | import numpy as np
10 | from collections import defaultdict
11 | from .classifier import normaliselabels
12 | from .base import base_adaptor, supervised_model
13 |
14 | __all__ = [
15 |     'voting_learner',
16 |     'mean_learner',
17 |     'remove_outliers',
18 |     'filter_outliers',
19 |     ]
20 |
21 | def _concatenate_features_labels(gfeatures, glabels):
22 |     if type(gfeatures) == np.ndarray and gfeatures.dtype == object:
23 |         gfeatures = list(gfeatures)
24 |     features = np.concatenate(gfeatures)
25 |     labels = []
26 |     for feats,label in zip(gfeatures, glabels):
27 |         labels.extend( [label] * len(feats) )
28 |     return features, labels
29 |
30 | class voting_learner(base_adaptor):
31 |     '''
32 |     Implements a voting scheme for multiple sub-examples per example.
33 |
34 |     classifier = voting_learner(base)
35 |
36 |     `base` should be a binary classifier
37 |
38 |     Example
39 |     -------
40 |
41 |     ::
42 |
43 |         voterlearn = voting_learner(milk.supervised.svm_simple(C=1., kernel=np.dot))
44 |         voter = voterlearn.train(training_groups, labeled_groups)
45 |         res = voter.apply([f0, f1, f3])
46 |
47 |     '''
48 |
49 |     def train(self, gfeatures, glabels, normalisedlabels=False):
50 |         features, labels = _concatenate_features_labels(gfeatures, glabels)
51 |         return voting_model(self.base.train(features, labels))
52 | voting_classifier = voting_learner
53 |
54 |
55 | class voting_model(supervised_model):
56 |     def __init__(self, base):
57 |         self.base = base
58 |
59 |     def apply(self, gfeatures):
60 |         votes = defaultdict(int)
61 |         for feats in gfeatures:
62 |             votes[self.base.apply(feats)] += 1
63 |         best = None
64 |         most_votes = 0
65 |         for k,v in votes.items():
66 |             if v > most_votes:
67 |                 best = k
68 |                 most_votes = v
69 |         return best
70 |
71 | class mean_learner(base_adaptor):
72 |     '''
73 |     Implements a mean scheme for multiple sub-examples per example.
74 |
75 |     classifier = mean_learner(base)
76 |
77 |     `base` should be a classifier that returns a numeric confidence value
78 |     `classifier` will return the **mean** of the base's values over the group
79 |
80 |     Example
81 |     -------
82 |
83 |     ::
84 |
85 |         meanlearner = mean_learner(milk.supervised.svm.svm_raw(kernel=np.dot, C=1.))
86 |         model = meanlearner.train(training_groups, labeled_groups)
87 |         res = model.apply([f0, f1, f3])
88 |
89 |     '''
90 |     def train(self, gfeatures, glabels, normalisedlabels=False):
91 |         features, labels = _concatenate_features_labels(gfeatures, glabels)
92 |         return mean_model(self.base.train(features, labels))
93 |
94 | mean_classifier = mean_learner
95 |
96 | class mean_model(supervised_model):
97 |     def __init__(self, base):
98 |         self.base = base
99 |
100 |     def apply(self, gfeatures):
101 |         return np.mean([self.base.apply(feats) for feats in gfeatures])
102 |
103 |
104 | def remove_outliers(features, limit, min_size):
105 |     '''
106 |     features = remove_outliers(features, limit, min_size)
107 |     Keep the fraction `limit` of examples with the smallest mean squared z-score; if fewer than `min_size` examples would remain, return `features` unchanged.
108 |     '''
109 |     nsize = int(limit * len(features))
110 |     if nsize < min_size:
111 |         return features
112 |
113 |     normed = features - features.mean(0)
114 |     std = normed.std(0)
115 |     std[std == 0] = 1
116 |     normed /= std
117 |     f2_sum1 = (normed**2).mean(1)
118 |     values = f2_sum1.copy()
119 |     values.sort()
120 |     top = values[nsize]
121 |     selected = f2_sum1 < top
122 |     return features[selected]
123 |
124 |
125 | class filter_outliers_model(supervised_model):
126 |     def __init__(self, limit, min_size):
127 |         self.limit = limit
128 |         self.min_size = min_size
129 |
130 |     def apply(self, features):
131 |         return remove_outliers(features, self.limit, self.min_size)
132 |
133 | class filter_outliers(object):
134 |     def __init__(self, limit=.9, min_size=3):
135 |         self.limit = limit
136 |         self.min_size = min_size
137 |
138 |     def train(self, features, labels, normalisedlabels=False):
139 |         return filter_outliers_model(self.limit, self.min_size)
140 |
141 |
-------------------------------------------------------------------------------- /milk/supervised/knn.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | from __future__ import division
8 | from collections import defaultdict
9 | from milk.utils import get_nprandom
10 | import numpy as np
11 | from .base import supervised_model
12 |
13 | __all__ = [
14 |     'kNN',
15 |     'knn_learner',
16 |     'approximate_knn_learner',
17 |     ]
18 |
19 | def _plurality(xs):
20 |     from collections import defaultdict
21 |     counts = defaultdict(int)
22 |     for x in xs: counts[x] += 1
23 |     best,_ = max(iter(counts.items()), key=(lambda k_v: k_v[1]))
24 |     return best
25 |
26 | class kNN(object):
27 |     '''
28 |     k-Nearest Neighbour Classifier
29 |
30 |     Naive implementation of a k-nearest neighbour classifier.
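    Distances are ranked via the expansion ``||x - q||^2 = ||x||^2 - 2 x.q +
    ||q||^2``: the ``||q||^2`` term is the same for every stored example, so
    ``kNN_model.apply`` below drops it before sorting neighbours.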
31 | 32 | C = kNN(k) 33 | 34 | Attributes: 35 | ----------- 36 | k : integer 37 | number of neighbours to consider 38 | ''' 39 | 40 | 41 | def __init__(self, k=1): 42 | self.k = k 43 | 44 | def train(self, features, labels, normalisedlabels=False, copy_features=False): 45 | features = np.asanyarray(features) 46 | labels = np.asanyarray(labels) 47 | if copy_features: 48 | features = features.copy() 49 | labels = labels.copy() 50 | features2 = np.sum(features**2, axis=1) 51 | return kNN_model(self.k, features, features2, labels) 52 | 53 | knn_learner = kNN 54 | 55 | class kNN_model(supervised_model): 56 | def __init__(self, k, features, features2, labels): 57 | self.k = k 58 | self.features = features 59 | self.f2 = features2 60 | self.labels = labels 61 | 62 | def apply(self, features): 63 | features = np.asanyarray(features) 64 | diff2 = np.dot(self.features, (-2.)*features) 65 | diff2 += self.f2 66 | neighbours = diff2.argsort()[:self.k] 67 | labels = self.labels[neighbours] 68 | return _plurality(labels) 69 | 70 | 71 | class approximate_knn_model(supervised_model): 72 | def __init__(self, k, X, projected): 73 | self.k = k 74 | self.X = X 75 | self.projected = projected 76 | self.p2 = np.array([np.dot(p,p) for p in projected]) 77 | 78 | def apply(self, t): 79 | tx = np.dot(self.X.T, t) 80 | d = np.dot(self.projected,tx) 81 | d *= -2 82 | d += self.p2 83 | if self.k == 1: 84 | return np.array([d.argmin()]) 85 | d = d.argsort() 86 | return d[:self.k] 87 | 88 | class approximate_knn_classification_model(supervised_model): 89 | def __init__(self, k, X, projected, labels): 90 | self.base = approximate_knn_model(k, X, projected) 91 | self.labels = labels 92 | 93 | def apply(self, f): 94 | idxs = self.base.apply(f) 95 | return _plurality(self.labels[idxs]) 96 | 97 | class approximate_knn_learner(object): 98 | ''' 99 | approximate_knn_learner 100 | 101 | Learns a k-nearest neighbour classifier, where the proximity is approximate 102 | as it is computed on a small dimensional subspace (random subspace 103 | projection). For many datasets, this is acceptable. 104 | ''' 105 | 106 | def __init__(self, k, ndims=8): 107 | self.k = k 108 | self.ndims = ndims 109 | def train(self, features, labels, **kwargs): 110 | labels = np.asanyarray(labels) 111 | R = get_nprandom(kwargs.get('R')) 112 | _, n_features = features.shape 113 | X = R.random_sample((n_features, self.ndims)) 114 | projected = np.dot(features, X) 115 | return approximate_knn_classification_model(self.k, X, projected, labels.copy()) 116 | 117 | -------------------------------------------------------------------------------- /milk/supervised/logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | from .normalise import normaliselabels 9 | from .base import supervised_model 10 | 11 | __all__ = [ 12 | 'logistic_learner', 13 | ] 14 | 15 | @np.vectorize 16 | def _sigmoidal(z): 17 | if (z > 300): return 1. 18 | if z < -300: return 0. 
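    # saturate far from zero so that np.exp below cannot overflow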
19 | return 1./(1+np.exp(-z)) 20 | 21 | class logistic_model(supervised_model): 22 | def __init__(self, bs): 23 | self.bs = bs 24 | 25 | def apply(self, fs): 26 | return _sigmoidal(self.bs[0] + np.dot(fs, self.bs[1:])) 27 | 28 | class logistic_learner(object): 29 | ''' 30 | learner = logistic_learner(alpha=0.0) 31 | 32 | Logistic regression learner 33 | 34 | There are two implementations: 35 | 36 | 1. One which depends on ``scipy.optimize``. This is the default and is 37 | extremely fast. 38 | 2. If ``import scipy`` fails, then we fall back to a Python only 39 | gradient-descent. This gives good results, but is many times slower. 40 | 41 | Properties 42 | ---------- 43 | 44 | alpha : real, optional 45 | penalty for L2-normalisation. Default is zero, for no penalty. 46 | 47 | ''' 48 | def __init__(self, alpha=0.0): 49 | self.alpha = alpha 50 | 51 | def train(self, features, labels, normalisedlabels=False, names=None, **kwargs): 52 | def error(bs): 53 | response = bs[0] + np.dot(features, bs[1:]) 54 | response = _sigmoidal(response) 55 | diff = response - labels 56 | log_like = np.dot(diff, diff) 57 | L2_penalty = self.alpha * np.dot(bs, bs) 58 | return log_like + L2_penalty 59 | def error_prime(bs): 60 | fB = np.dot(features, bs[1:]) 61 | response = _sigmoidal(bs[0] + fB) 62 | sprime = response * (1-response) 63 | ds = (response - labels) * sprime 64 | b0p = np.sum(ds) 65 | b1p = np.dot(features.T, ds) 66 | bp = np.concatenate( ([b0p], b1p) ) 67 | return 2.*(bp + self.alpha*bs) 68 | 69 | features = np.asanyarray(features) 70 | if not normalisedlabels: 71 | labels, _ = normaliselabels(labels) 72 | N,f = features.shape 73 | bs = np.zeros(f+1) 74 | try: 75 | from scipy import optimize 76 | # Some testing revealed that this was a good combination 77 | # call fmin_cg twice first and then fmin 78 | # I do not understand why 100%, but there it is 79 | bs = optimize.fmin_cg(error, bs, error_prime, disp=False) 80 | bs = optimize.fmin_cg(error, bs, error_prime, disp=False) 81 | bs = optimize.fmin(error, bs, disp=False) 82 | except ImportError: 83 | import warnings 84 | warnings.warn('''\ 85 | milk.supervised.logistic.train: Could not import scipy.optimize. 86 | Fall back to very simple gradient descent (which is slow).''') 87 | bs = np.zeros(f+1) 88 | cur = 1.e-6 89 | ebs = error(bs) 90 | for i in range(1000000): 91 | dir = error_prime(bs) 92 | step = (lambda e : bs - e *dir) 93 | enbs = ebs + 1 94 | while enbs > ebs: 95 | cur /= 2. 96 | if cur == 0.: 97 | break 98 | nbs = step(cur) 99 | enbs = error(nbs) 100 | while cur < 10.: 101 | cur *= 2 102 | nnbs = step(cur) 103 | ennbs = error(nnbs) 104 | if ennbs < enbs: 105 | nbs = nnbs 106 | enbs = ennbs 107 | else: 108 | break 109 | bs = nbs 110 | ebs = enbs 111 | return logistic_model(bs) 112 | -------------------------------------------------------------------------------- /milk/supervised/multi_label.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. 
See COPYING.MIT file in the milk distribution
5 |
6 | from __future__ import division
7 | import numpy as np
8 | from .base import supervised_model, base_adaptor
9 |
10 | class one_by_one_model(supervised_model):
11 |     def __init__(self, models):
12 |         self.models = models
13 |
14 |     def apply(self, fs):
15 |         result = []
16 |         for ell,model in self.models.items():
17 |             if model.apply(fs):
18 |                 result.append(ell)
19 |         return result
20 |
21 |
22 | class one_by_one(base_adaptor):
23 |     '''
24 |     Implements a 1-vs-all multi-label classifier by transforming a base (binary)
25 |     classifier.
26 |
27 |     Example
28 |     -------
29 |
30 |     features = [....]
31 |     labels = [
32 |         (0,),
33 |         (1,2),
34 |         (0,2),
35 |         (0,3),
36 |         (1,2,3),
37 |         (2,0),
38 |         ...
39 |         ]
40 |     learner = one_by_one(milk.defaultlearner())
41 |     model = learner.train(features, labels)
42 |     '''
43 |     def train(self, features, labels, **kwargs):
44 |         universe = set()
45 |         for ls in labels:
46 |             universe.update(ls)
47 |         models = {}
48 |         for ell in universe:
49 |             contained = np.array([int(ell in ls) for ls in labels])
50 |             models[ell] = self.base.train(features, contained, normalisedlabels=True)
51 |         return one_by_one_model(models)
52 |
53 |     def __str__(self):
54 |         return 'one_by_one({})'.format(self.base)
55 |
-------------------------------------------------------------------------------- /milk/supervised/multi_view.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | import numpy as np
8 |
9 |
10 | __all__ = [
11 |     'multi_view_learner',
12 |     ]
13 | class multi_view_model(object):
14 |     def __init__(self, models):
15 |         self.models = models
16 |
17 |     def apply(self, features):
18 |         if len(features) != len(self.models):
19 |             raise ValueError('milk.supervised.multi_view: Nr of features does not match training data (got %s, expected %s)' % (len(features), len(self.models)))
20 |         Ps = np.array([model.apply(f) for model,f in zip(self.models, features)])
21 |         if np.any(Ps <= 0.): return False
22 |         if np.any(Ps >= 1.): return True
23 |         # This is binary only:
24 |         #   if \prod Pi > \prod (1-Pi) return 1
25 |         # is equivalent to
26 |         #   if \prod Pi/(1-Pi) > 1. return 1
27 |         #   if \sum \log( Pi/(1-Pi) ) > 0. return 1
28 |         return np.sum( np.log(Ps/(1-Ps)) ) > 0
29 |
30 |
31 | class multi_view_learner(object):
32 |     '''
33 |     Multi View Learner
34 |
35 |     This learner learns different classifiers on multiple sets of features and
36 |     combines them for classification.
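    Example
    -------
    A hypothetical sketch with two synthetic feature views, using
    ``logistic_learner`` (defined in this package) as the per-view base,
    since it returns a confidence in (0, 1)::

        import numpy as np
        from milk.supervised.logistic import logistic_learner
        from milk.supervised.multi_view import multi_view_learner

        learner = multi_view_learner([logistic_learner(), logistic_learner()])
        # each example is a pair of views, with 4 and 6 features respectively
        features = [(np.random.randn(4), np.random.randn(6)) for _ in range(30)]
        labels = np.repeat((0, 1), 15)
        model = learner.train(features, labels)
        model.apply((np.random.randn(4), np.random.randn(6)))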
37 |
38 |     '''
39 |     def __init__(self, bases):
40 |         self.bases = bases
41 |
42 |     def train(self, features, labels, normalisedlabels=False):
43 |         features = list(zip(*features))
44 |         if len(features) != len(self.bases):
45 |             raise ValueError('milk.supervised.multi_view_learner: ' +
46 |                 'Nr of features does not match classifier construction (got %s, expected %s)'
47 |                     % (len(features), len(self.bases)))
48 |         models = []
49 |         for basis,f in zip(self.bases, features):
50 |             try:
51 |                 f = np.array(f)
52 |             except:
53 |                 f = np.array(f, dtype=object)
54 |             models.append(basis.train(f, labels))
55 |         return multi_view_model(models)
56 |
57 | multi_view_classifier = multi_view_learner
58 |
-------------------------------------------------------------------------------- /milk/supervised/normalise.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | from __future__ import division
8 | import numpy as np
9 | from .base import supervised_model
10 | from ..unsupervised.normalise import zscore
11 |
12 | __all__ = [
13 |     'zscore',
14 |     'zscore_normalise',
15 |     'interval_normalise',
16 |     'chkfinite',
17 |     'sample_to_2min',
18 |     'normaliselabels'
19 |     ]
20 |
21 |
22 | class subtract_divide_model(supervised_model):
23 |     def __init__(self, shift, factor):
24 |         factor[factor == 0] = 1 # This makes the division a null op.
25 |
26 |         self.shift = shift
27 |         self.factor = factor
28 |
29 |     def apply_many(self, features):
30 |         if len(features) == 0:
31 |             return features
32 |         return (features - self.shift)/self.factor
33 |
34 |     def apply(self, features):
35 |         return (features - self.shift)/self.factor
36 |
37 |     def __repr__(self):
38 |         return 'subtract_divide_model(%s, %s)' % (self.shift, self.factor)
39 |
40 | class zscore_normalise(object):
41 |     '''
42 |     Normalise to z-scores
43 |
44 |     A preprocessor that normalises features to z scores.
45 |     '''
46 |
47 |     def train(self, features, labels, **kwargs):
48 |         shift = features.mean(0)
49 |         factor = np.std(features,0)
50 |         return subtract_divide_model(shift, factor)
51 |
52 | class interval_normalise(object):
53 |     '''
54 |     Linearly scale to the interval [-1,1] (per libsvm recommendation)
55 |
56 |     '''
57 |     def train(self, features, labels, **kwargs):
58 |         ptp = features.ptp(0)
59 |         shift = features.min(0) + ptp/2.
60 |         factor = ptp/2.
61 | return subtract_divide_model(shift, factor) 62 | 63 | def __repr__(self): 64 | return 'interval_normalise()' 65 | 66 | 67 | def sample_to_2min(labels): 68 | ''' 69 | selected = sample_to_2min(labels) 70 | 71 | Select examples so that the ratio of size of the largest 72 | class to the smallest class is at most two (i.e., 73 | min_label_count = min { (labels == L).sum() | for L in set(labels) } 74 | for L' in set(labels): 75 | assert (labels == L').sum() <= 2 * min_label_count 76 | ) 77 | 78 | Parameters 79 | ---------- 80 | labels : sequence of labels 81 | 82 | Returns 83 | ------- 84 | selected : a Boolean numpy.ndarray 85 | ''' 86 | from collections import defaultdict 87 | counts = defaultdict(int) 88 | for n in labels: 89 | counts[n] += 1 90 | 91 | labels = np.asanyarray(labels) 92 | max_entries = np.min(list(counts.values()))*2 93 | selected = np.zeros(len(labels), bool) 94 | for c in counts.keys(): 95 | p, = np.where(labels == c) 96 | p = p[:max_entries] 97 | selected[p] = 1 98 | return selected 99 | 100 | 101 | 102 | class chkfinite(supervised_model): 103 | ''' 104 | Fill NaN & Inf values 105 | 106 | Replaces NaN & Inf values with zeros. 107 | ''' 108 | def __init__(self): 109 | pass 110 | 111 | def train(self, features, labels, **kwargs): 112 | return self 113 | 114 | def apply(self, features): 115 | nans = np.isnan(features) + np.isinf(features) 116 | if nans.any(): 117 | features = features.copy() 118 | features[nans] = 0 119 | return features 120 | 121 | def __repr__(self): 122 | return 'chkfinite()' 123 | 124 | def normaliselabels(labels, multi_label=False): 125 | ''' 126 | normalised, names = normaliselabels(labels, multi_label=False) 127 | 128 | If not ``multi_label`` (the default), normalises the labels to be integers 129 | from 0 through N-1. Otherwise, assume that each label is actually a 130 | sequence of labels. 131 | 132 | ``normalised`` is a np.array, while ``names`` is a list mapping the indices to 133 | the old names. 134 | 135 | Parameters 136 | ---------- 137 | labels : any iterable of labels 138 | multi_label : bool, optional 139 | Whether labels are actually composed of multiple labels 140 | 141 | Returns 142 | ------ 143 | normalised : a numpy ndarray 144 | If not ``multi_label``, this is an array of integers 0 .. N-1; 145 | otherwise, it is a boolean array of size len(labels) x N 146 | names : list of label names 147 | ''' 148 | if multi_label: 149 | names = set() 150 | for ell in labels: names.update(ell) 151 | names = list(sorted(names)) 152 | normalised = np.zeros( (len(labels), len(names)), bool) 153 | for i,ls in enumerate(labels): 154 | for ell in map(names.index, ls): 155 | normalised[i,ell] = True 156 | return normalised, names 157 | else: 158 | names = sorted(set(labels)) 159 | normalised = list(map(names.index, labels)) 160 | return np.array(normalised), names 161 | 162 | -------------------------------------------------------------------------------- /milk/supervised/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. 
See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | 10 | def get_parzen_rbf_loocv(features,labels): 11 | xij = np.dot(features,features.T) 12 | f2 = np.sum(features**2,1) 13 | d = f2-2*xij 14 | d = d.T + f2 15 | d_argsorted = d.argsort(1) 16 | d_sorted = d.copy() 17 | d_sorted.sort(1) 18 | e_d = np.exp(-d_sorted) 19 | labels_sorted = labels[d_argsorted].astype(np.double) 20 | labels_sorted *= 2 21 | labels_sorted -= 1 22 | def f(sigma): 23 | k = e_d ** (1./sigma) 24 | return (((k[:,1:] * labels_sorted[:,1:]).sum(1) > 0) == labels).mean() 25 | return f 26 | 27 | 28 | -------------------------------------------------------------------------------- /milk/supervised/perceptron.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | import numpy as np 8 | from .classifier import normaliselabels 9 | from .base import supervised_model 10 | from . import _perceptron 11 | 12 | class perceptron_model(supervised_model): 13 | def __init__(self, w): 14 | self.w = w 15 | 16 | def apply(self, f): 17 | f = np.asanyarray(f) 18 | v = self.w[0] + np.dot(f, self.w[1:]) 19 | return v > 0 20 | 21 | class perceptron_learner(object): 22 | def __init__(self, eta=.1, max_iters=128): 23 | self.eta = eta 24 | self.max_iters = max_iters 25 | 26 | def train(self, features, labels, normalisedlabels=False, **kwargs): 27 | if not normalisedlabels: 28 | labels, _ = normaliselabels(labels) 29 | features = np.asanyarray(features) 30 | if features.dtype not in (np.float32, np.float64): 31 | features = features.astype(np.float64) 32 | weights = np.zeros(features.shape[1]+1, features.dtype) 33 | for i in range(self.max_iters): 34 | errors = _perceptron.perceptron(features, labels, weights, self.eta) 35 | if not errors: 36 | break 37 | return perceptron_model(weights) 38 | 39 | 40 | -------------------------------------------------------------------------------- /milk/supervised/precluster_learner.py: -------------------------------------------------------------------------------- 1 | from .precluster import * 2 | -------------------------------------------------------------------------------- /milk/supervised/randomforest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. 
See COPYING.MIT file in the milk distribution
6 |
7 | '''
8 | Random Forest
9 | -------------
10 |
11 | Main elements
12 | -------------
13 |
14 | rf_learner : A learner object
15 | '''
16 |
17 | from __future__ import division
18 | import numpy as np
19 | import milk.supervised.tree
20 | from .normalise import normaliselabels
21 | from .base import supervised_model
22 | from ..utils import get_nprandom
23 |
24 | __all__ = [
25 |     'rf_learner',
26 |     ]
27 |
28 | def _sample(features, labels, n, R):
29 |     '''
30 |     features', labels' = _sample(features, labels, n, R)
31 |
32 |     Sample n elements (with replacement) from (features,labels)
33 |
34 |     Parameters
35 |     ----------
36 |     features : sequence
37 |     labels : sequence
38 |         Same size as features
39 |     n : integer
40 |     R : random object
41 |
42 |     Returns
43 |     -------
44 |     features' : sequence
45 |     labels' : sequence
46 |     '''
47 |
48 |     N = len(features)
49 |     sfeatures = []
50 |     slabels = []
51 |     for i in range(n):
52 |         idx = R.randint(N)
53 |         sfeatures.append(features[idx])
54 |         slabels.append(labels[idx])
55 |     return np.array(sfeatures), np.array(slabels)
56 |
57 | class rf_model(supervised_model):
58 |     def __init__(self, forest, names, return_label = True):
59 |         self.forest = forest
60 |         self.names = names
61 |         self.return_label = return_label
62 |
63 |     def apply(self, features):
64 |         rf = len(self.forest)
65 |         votes = sum(t.apply(features) for t in self.forest)
66 |         if self.return_label:
67 |             return (votes > (rf//2))
68 |         return votes / rf
69 |
70 |
71 | class rf_learner(object):
72 |     '''
73 |     Random Forest Learner
74 |
75 |     learner = rf_learner(rf=101, frac=.7)
76 |
77 |     Attributes
78 |     ----------
79 |     rf : integer, optional
80 |         Nr of trees to learn (default: 101)
81 |     frac : float, optional
82 |         Sample fraction
83 |     R : np.random object
84 |         Source of randomness
85 |     '''
86 |     def __init__(self, rf=101, frac=.7, R=None):
87 |         self.rf = rf
88 |         self.frac = frac
89 |         self.R = get_nprandom(R)
90 |
91 |     def train(self, features, labels, normalisedlabels=False, names=None, return_label=True, **kwargs):
92 |         N,M = features.shape
93 |         m = int(self.frac*M)
94 |         n = int(self.frac*N)
95 |         R = get_nprandom(kwargs.get('R', self.R))
96 |         tree = milk.supervised.tree.tree_learner(return_label=return_label)
97 |         forest = []
98 |         if not normalisedlabels:
99 |             labels,names = normaliselabels(labels)
100 |         elif names is None:
101 |             names = (0,1)
102 |         for i in range(self.rf):
103 |             forest.append(
104 |                 tree.train(*_sample(features, labels, n, R),
105 |                     **{'normalisedlabels' : True})) # This syntax is necessary for Python 2.5
106 |         return rf_model(forest, names, return_label)
107 |
108 |
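``rf_learner`` trains ``rf`` trees, each on a fraction ``frac`` of the examples
sampled with replacement, and the model takes a majority vote. A quick sketch on
synthetic data (small ``rf`` to keep it fast)::

    import numpy as np
    from milk.supervised.randomforest import rf_learner

    learner = rf_learner(rf=11, frac=.7)
    features = np.random.rand(60, 8)
    features[:30] += .6                  # separate the two classes
    labels = np.repeat((0, 1), 30)
    model = learner.train(features, labels)
    model.apply(features[0])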
-------------------------------------------------------------------------------- /milk/supervised/set2binary_array.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 |
8 | import numpy as np
9 |
10 | __all__ = [
11 |     'set2binary_array',
12 |     ]
13 |
14 | class set2binary_array_model(object):
15 |     def __init__(self, universe):
16 |         self.universe = list(universe)
17 |
18 |     def apply(self, elems):
19 |         res = np.zeros(len(self.universe) + 1, bool)
20 |         for e in elems:
21 |             try:
22 |                 res[self.universe.index(e)] = True
23 |             except ValueError:
24 |                 res[-1] = True
25 |         return res
26 |
27 | class set2binary_array(object):
28 |     def train(self, features, labels, normalisedlabels=False):
29 |         allfeatures = set()
30 |         for f in features:
31 |             allfeatures.update(f)
32 |         return set2binary_array_model(allfeatures)
33 |
-------------------------------------------------------------------------------- /milk/supervised/weighted_voting_adaboost.py: --------------------------------------------------------------------------------
1 | from math import exp, log
2 | from operator import itemgetter
3 |
4 | '''
5 | AdaBoost implementation with weighted voting as a decision procedure
6 | '''
7 | class weighted_voting_adaboost(object):
8 |     # initializes with already-built classifiers and their corresponding coefficients
9 |     def __init__(self, in_classifiers, in_coefficients):
10 |         self.classifiers = in_classifiers
11 |         self.coefficients = in_coefficients
12 |
13 |     # decision by weighted voting
14 |     def apply(self, in_features):
15 |         # a "class number" => "votes value" mapping
16 |         answers = {}
17 |         for classifier, coefficient in zip(self.classifiers, self.coefficients):
18 |             answer = classifier.apply(in_features)
19 |             if answer in answers:
20 |                 answers[answer] += coefficient
21 |             else:
22 |                 answers[answer] = coefficient
23 |         # dict maximum by value
24 |         result = max(iter(answers.items()), key=itemgetter(1))
25 |         return result[0]
26 |
27 |
28 | class weighted_voting_ada_learner(object):
29 |     def __init__(self, in_composition_size, in_learner):
30 |         self.learner = in_learner
31 |         self.composition_size = in_composition_size
32 |
33 |     def reset(self, in_features):
34 |         self.classifiers = []
35 |         # linear coefficients for the classifiers in the composition
36 |         self.coefficients = []
37 |         self.weights = [1. / float(len(in_features))] * len(in_features)
38 |
39 |     def train(self, in_features, in_labels):
40 |         self.reset(in_features)
41 |
42 |         for iteration in range(self.composition_size):
43 |             self.classifiers.append(self.learner.train(in_features, in_labels, weights=self.weights))
44 |             # new classifier initially gets weight 1
45 |             self.coefficients.append(1)
46 |             answers = []
47 |             for obj in in_features:
48 |                 answers.append(self.classifiers[-1].apply(obj))
49 |             err = self.compute_weighted_error(in_labels, answers)
50 |             if abs(err) < 1e-6:
51 |                 return weighted_voting_adaboost(self.classifiers, self.coefficients)
52 |
53 |             alpha = 0.5 * log((1.0 - err) / err)
54 |             # updating the coefficient of the last added classifier
55 |             self.coefficients[-1] = alpha
56 |
57 |             self.update_weights(in_labels, answers, alpha)
58 |             self.normalize_weights()
59 |         return weighted_voting_adaboost(self.classifiers, self.coefficients)
60 |
61 |     def compute_weighted_error(self, in_labels, in_answers):
62 |         error = 0.
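        # error accumulates the weight-normalised fraction of examples the newest classifier got wrong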
63 | w_sum = sum(self.weights) 64 | for ind in range(len(in_labels)): 65 | error += (in_answers[ind] != in_labels[ind]) * self.weights[ind] / w_sum 66 | return error 67 | 68 | def update_weights(self, in_labels, in_answers, in_alpha): 69 | for ind in range(len(in_labels)): 70 | self.weights[ind] *= exp(in_alpha * (in_answers[ind] != in_labels[ind])) 71 | 72 | def normalize_weights(self): 73 | w_sum = sum(self.weights) 74 | 75 | for ind in range(len(self.weights)): 76 | self.weights[ind] /= w_sum 77 | -------------------------------------------------------------------------------- /milk/tests/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import milksets 3 | del milksets 4 | except ImportError: 5 | import sys 6 | sys.stderr.write('''\ 7 | Could not import milksets. 8 | 9 | This companion package does not provide any functionality, but 10 | is necessary for some of the testing.''') 11 | 12 | 13 | def run(verbose=False): 14 | import nose 15 | from os import path 16 | currentdir = path.dirname(__file__) 17 | updir = path.join(currentdir, '..') 18 | argv = ['', '--exe', '-w', updir] 19 | if verbose: 20 | argv.append('--verbose') 21 | nose.run('milk', argv=argv) 22 | 23 | -------------------------------------------------------------------------------- /milk/tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/tests/data/__init__.py -------------------------------------------------------------------------------- /milk/tests/data/jugparallel_jugfile.py: -------------------------------------------------------------------------------- 1 | import milk.ext.jugparallel 2 | from milksets.wine import load 3 | from milk.tests.fast_classifier import fast_classifier 4 | features,labels = load() 5 | classified = milk.ext.jugparallel.nfoldcrossvalidation(features, labels, learner=fast_classifier()) 6 | classified_wpred = milk.ext.jugparallel.nfoldcrossvalidation(features, labels, learner=fast_classifier(), return_predictions=True) 7 | 8 | -------------------------------------------------------------------------------- /milk/tests/data/jugparallel_kmeans_jugfile.py: -------------------------------------------------------------------------------- 1 | import milk.ext.jugparallel 2 | from milksets.wine import load 3 | from milk.tests.fast_classifier import fast_classifier 4 | features,labels = load() 5 | 6 | clustered = milk.ext.jugparallel.kmeans_select_best(features, ks=(2,8), repeats=2, max_iters=6) 7 | 8 | -------------------------------------------------------------------------------- /milk/tests/data/regression-2-Dec-2009.pp.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/tests/data/regression-2-Dec-2009.pp.gz -------------------------------------------------------------------------------- /milk/tests/fast_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.base import supervised_model 3 | class fast_classifier(object): 4 | def __init__(self): 5 | pass 6 | 7 | def set_option(self, _k, _v): 8 | pass 9 | 10 | def train(self, features, labels, **kwargs): 11 | examples = {} 12 | for f,lab in zip(features, labels): 13 | if lab not in examples: 14 | examples[lab] = f 15 | return 
fast_model(examples) 16 | 17 | class fast_model(supervised_model): 18 | def __init__(self, examples): 19 | self.examples = examples 20 | assert len(self.examples) 21 | 22 | def apply(self, f): 23 | best = None 24 | best_val = +np.inf 25 | for k,v in self.examples.items(): 26 | d = v-f 27 | dist = np.dot(d,d) 28 | if dist < best_val: 29 | best = k 30 | best_val = dist 31 | return best 32 | 33 | 34 | -------------------------------------------------------------------------------- /milk/tests/test_adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.tree 3 | import milk.supervised.adaboost 4 | def test_learner(): 5 | from milksets import wine 6 | learner = milk.supervised.adaboost.boost_learner(milk.supervised.tree.stump_learner()) 7 | features, labels = wine.load() 8 | features = features[labels < 2] 9 | labels = labels[labels < 2] == 0 10 | labels = labels.astype(int) 11 | model = learner.train(features[::2], labels[::2]) 12 | train_out = np.array(list(map(model.apply, features))) 13 | assert (train_out == labels).mean() > .9 14 | 15 | 16 | def test_too_many_boolean_indices_regression(): 17 | import milk.supervised.randomforest 18 | import milk.supervised.adaboost 19 | import milksets.wine 20 | from milk.supervised.multi import one_against_one 21 | 22 | weak = milk.supervised.randomforest.rf_learner() 23 | learner = milk.supervised.adaboost.boost_learner(weak) 24 | learner = one_against_one(learner) 25 | 26 | features, labels = milksets.wine.load() 27 | 28 | # sample features so that the test is faster (still gives error): 29 | learner.train(features[::16], labels[::16]) 30 | -------------------------------------------------------------------------------- /milk/tests/test_affinity.py: -------------------------------------------------------------------------------- 1 | import milk.unsupervised.affinity 2 | import numpy as np 3 | def test_affinity(): 4 | np.random.seed(22) 5 | X = np.random.randn(100,10) 6 | X[:40] += .4 7 | S = milk.unsupervised.pdist(X) 8 | clusters, labels = milk.unsupervised.affinity.affinity_propagation(S) 9 | assert labels.max()+1 == len(clusters) 10 | assert len(labels) == len(X) 11 | assert clusters.max() < len(X) 12 | -------------------------------------------------------------------------------- /milk/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import milk 3 | -------------------------------------------------------------------------------- /milk/tests/test_curves.py: -------------------------------------------------------------------------------- 1 | from milk.measures.curves import precision_recall 2 | import numpy as np 3 | def test_precision_recall(): 4 | labels = [0,1]*10 5 | values = np.linspace(0,1,len(labels)) 6 | precision, recall = precision_recall(values, labels) 7 | assert np.min(recall) >= 0. 8 | assert np.max(recall) <= 1. 9 | assert np.max(precision) <= 1. 10 | assert np.min(precision) >= 0. 11 | 12 | labels = [0]*10 + [1] * 10 13 | values = np.linspace(0,1.,20) 14 | precision,recall = precision_recall(values, labels, 'steps', 10) 15 | assert min(precision) >= .5 16 | assert max(precision) == 1. 17 | assert max(recall) == 1. 
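    # the positives hold the top half of the scores, so precision is at least .5 at every threshold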
18 | 19 | -------------------------------------------------------------------------------- /milk/tests/test_defaultclassifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk 3 | import milk.supervised.defaultclassifier 4 | import pickle 5 | 6 | def test_defaultclassifier(): 7 | from milksets import wine 8 | features, labels = wine.load() 9 | C = milk.supervised.defaultclassifier() 10 | model = C.train(features,labels) 11 | labelset = set(labels) 12 | for f in features: 13 | assert model.apply(f) in labelset 14 | test_defaultclassifier.slow = True 15 | 16 | def test_pickle(): 17 | np.random.seed(23232432) 18 | X = np.random.rand(100,10) 19 | labels = np.zeros(100) 20 | X[50:] += .5 21 | labels[50:] = 1 22 | classifier = milk.supervised.defaultclassifier() 23 | model = classifier.train(X, labels) 24 | s = pickle.dumps(model) 25 | model = pickle.loads(s) 26 | test = [model.apply(x) for x in X] 27 | test = np.array(test) 28 | assert (test == labels).mean() > .6 29 | 30 | def test_pickle_learner(): 31 | learner = milk.defaultlearner() 32 | assert len(pickle.dumps(learner)) 33 | 34 | def test_expandend(): 35 | np.random.seed(23232432) 36 | X = np.random.rand(100,10) 37 | labels = np.zeros(100) 38 | X[50:] += .5 39 | labels[50:] = 1 40 | learners = milk.defaultlearner(expanded=True) 41 | for learner in learners: 42 | model = learner.train(X, labels) 43 | test = [model.apply(x) for x in X] 44 | test = np.array(test) 45 | assert set(test) == set(labels) 46 | 47 | -------------------------------------------------------------------------------- /milk/tests/test_defaultlearner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk 3 | def test_extra_arg(): 4 | from milksets.wine import load 5 | features,labels = load() 6 | learner = milk.defaultlearner() 7 | model = learner.train(features[::2],labels[::2], extra_arg=5) 8 | assert model.apply(features[1]) < 12. 
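    # wine labels are 0, 1, 2, so any sane prediction is far below 12; the point is that extra_arg is accepted and ignored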
9 | 10 | 11 | def test_empty_input(): 12 | learn = milk.defaultlearner() 13 | X = np.random.rand(60, 3) 14 | X[:32] += .52 15 | y = np.arange(60) > 35 16 | model = learn.train(X, y) 17 | preds = model.apply_many([]) 18 | assert len(preds) == 0 19 | -------------------------------------------------------------------------------- /milk/tests/test_ecoc_learner.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.multi import ecoc_learner 2 | from milk.supervised.classifier import ctransforms 3 | from milk.supervised import svm 4 | import milk.tests.fast_classifier 5 | import milk.supervised.multi 6 | from milksets.yeast import load 7 | import numpy as np 8 | 9 | def test_ecoc_learner(): 10 | base = milk.tests.fast_classifier.fast_classifier() 11 | learner = milk.supervised.multi.ecoc_learner(base) 12 | features, labels = load() 13 | nlabels = len(set(labels)) 14 | model = learner.train(features[::2],labels[::2]) 15 | 16 | testl = np.array(model.apply_many(features[1::2])) 17 | assert np.mean(testl == labels[1::2]) > 1./nlabels 18 | assert testl.min() >= 0 19 | assert testl.max() < nlabels 20 | 21 | # This failed at one point: 22 | learner = ecoc_learner(svm.svm_to_binary(svm.svm_raw(kernel=svm.dot_kernel(), C=1.))) 23 | model = learner.train(features[:200], labels[:200]) 24 | assert (model is not None) 25 | 26 | def test_ecoc_probability(): 27 | features,labels = load() 28 | features = features[labels < 5] 29 | labels = labels[labels < 5] 30 | raw = svm.svm_raw(kernel=svm.dot_kernel(), C=1.) 31 | base = ctransforms(raw, svm.svm_sigmoidal_correction()) 32 | learner = ecoc_learner(base, probability=True) 33 | model = learner.train(features[::2], labels[::2]) 34 | results = list(map(model.apply, features[1::2])) 35 | results = np.array(results) 36 | assert results.shape[1] == len(set(labels)) 37 | assert np.mean(results.argmax(1) == labels[1::2]) > .5 38 | -------------------------------------------------------------------------------- /milk/tests/test_ext_jugparallel.py: -------------------------------------------------------------------------------- 1 | try: 2 | import jug 3 | from jug import value 4 | import jug.options 5 | from jug.tests.utils import task_reset, simple_execute 6 | except ImportError: 7 | from nose import SkipTest 8 | def task_reset(f): 9 | def g(): 10 | raise SkipTest() 11 | return g 12 | 13 | @task_reset 14 | def test_nfoldcrossvalidation(): 15 | store, space = jug.jug.init('milk/tests/data/jugparallel_jugfile.py', 'dict_store') 16 | simple_execute() 17 | assert len(jug.value(space['classified'])) == 2 18 | assert len(jug.value(space['classified_wpred'])) ==3 19 | 20 | 21 | @task_reset 22 | def test_kmeans(): 23 | store, space = jug.jug.init('milk/tests/data/jugparallel_kmeans_jugfile.py', 'dict_store') 24 | simple_execute() 25 | assert len(value(space['clustered'])) == 2 26 | -------------------------------------------------------------------------------- /milk/tests/test_featureselection.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.featureselection 2 | from milk.supervised.featureselection import select_n_best, rank_corr 3 | import numpy as np 4 | def test_sda(): 5 | from milksets import wine 6 | features, labels = wine.load() 7 | selected = milk.supervised.featureselection.sda(features,labels) 8 | for sel in selected: 9 | assert sel <= features.shape[1] 10 | 11 | def test_linear_independent_features(): 12 | np.random.seed(122) 13 | X3 = 
np.random.rand(20,3) 14 | X = np.c_[X3,X3*2+np.random.rand(20,3)/20.,-X3*2+np.random.rand(20,3)/10.] 15 | X2 = np.c_[X3,X3*2,-X3*3e-3] 16 | assert len(milk.supervised.featureselection.linear_independent_features(X)) == 9 17 | assert len(milk.supervised.featureselection.linear_independent_features(X2)) == 3 18 | assert np.all(np.sort(milk.supervised.featureselection.linear_independent_features(X2) % 3) == np.arange(3)) 19 | 20 | def _rank(A,tol=1e-8): 21 | s = np.linalg.svd(A,compute_uv=0) 22 | return (s > tol).sum() 23 | 24 | def _slow_linear_independent_features(featmatrix): 25 | ''' 26 | Returns the indices of a set of linearly independent features (columns). 27 | 28 | indices = linear_independent_features(features) 29 | ''' 30 | independent = [0,] 31 | rank = 1 32 | feat = [featmatrix[:,0]] 33 | for i,col in enumerate(featmatrix.T): 34 | feat.append(col) 35 | nrank = _rank(np.array(feat)) 36 | if nrank == rank: 37 | del feat[-1] 38 | else: 39 | rank = nrank 40 | independent.append(i) 41 | return np.array(independent) 42 | 43 | 44 | def test_select_n(): 45 | from milksets.wine import load 46 | 47 | features,labels = load() 48 | for n in (1,2,4,8): 49 | select = select_n_best(n, rank_corr) 50 | model = select.train(features,labels) 51 | f = model.apply(features[3]) 52 | assert len(f) == n 53 | 54 | def slow_rank_corr(features, labels): 55 | features = np.asanyarray(features) 56 | labels = np.asanyarray(labels) 57 | binlabels = [(labels == ell) for ell in set(labels)] 58 | rs = [] 59 | for feat in features.T: 60 | ranks = feat.argsort() 61 | corrcoefs = [np.corrcoef(ranks, labs)[0,1] for labs in binlabels] 62 | corrcoefs = np.array(corrcoefs) 63 | corrcoefs **= 2 64 | rs.append(np.max(corrcoefs)) 65 | return np.array(rs) 66 | 67 | def test_compare_rank_corr(): 68 | from milksets.wine import load 69 | features,labels = load() 70 | r0 = rank_corr(features,labels) 71 | r1 = slow_rank_corr(features,labels) 72 | assert np.allclose(r0,r1) 73 | -------------------------------------------------------------------------------- /milk/tests/test_fisher.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.svm 2 | import milk.supervised.normalise 3 | import numpy as np 4 | 5 | def _slow_f(features,labels,kernel_or_sigma): 6 | try: 7 | kernel = kernel_or_sigma 8 | kernel(features[0],features[1]) 9 | except: 10 | kernel = milk.supervised.svm.rbf_kernel(kernel_or_sigma) 11 | N1 = (labels == 0).sum() 12 | N2 = (labels == 1).sum() 13 | x1 = features[labels == 0] 14 | x2 = features[labels == 1] 15 | dm = 0 16 | for i in range(N1): 17 | for j in range(N1): 18 | dm += kernel(x1[i],x1[j])/N1/N1 19 | for i in range(N2): 20 | for j in range(N2): 21 | dm += kernel(x2[i],x2[j])/N2/N2 22 | for i in range(N1): 23 | for j in range(N2): 24 | dm -= 2*kernel(x1[i],x2[j])/N1/N2 25 | s1 = N1 26 | for i in range(N1): 27 | for j in range(N1): 28 | s1 -= kernel(x1[i],x1[j])/N1 29 | s2 = N2 30 | for i in range(N2): 31 | for j in range(N2): 32 | s2 -= kernel(x2[i],x2[j])/N2 33 | return (s1 + s2)/dm 34 | 35 | 36 | def test_fisher_approx(): 37 | from milksets import wine 38 | features,labels = wine.load() 39 | f = milk.supervised.svm.sigma_value_fisher(features,labels) 40 | 
for sigma in (2.**-4,2.,16.,32.): 41 | assert abs(f(sigma) - _slow_f(features,labels,sigma)) < 1e-6 42 | -------------------------------------------------------------------------------- /milk/tests/test_gaussianmixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.unsupervised import gaussianmixture 3 | 4 | def _sq(x): 5 | return x*x 6 | def test_gm(): 7 | np.random.seed(22) 8 | centroids = np.repeat(np.arange(4), 4).reshape((4,4)) 9 | fmatrix = np.concatenate([(np.random.randn(12,4)+c) for c in centroids]) 10 | assignments = np.repeat(np.arange(4), 12) 11 | rss = sum(np.sum(_sq(fmatrix[i*12:(i+1)*12]-i)) for i in range(4)) 12 | assert np.abs(gaussianmixture.residual_sum_squares(fmatrix, assignments, centroids) - rss) < 1.e-12 13 | assert gaussianmixture.BIC(fmatrix, assignments, centroids) > 0 14 | assert gaussianmixture.AIC(fmatrix, assignments, centroids) > 0 15 | 16 | assert gaussianmixture.BIC(fmatrix, assignments, centroids, model='full_covariance') > \ 17 | gaussianmixture.BIC(fmatrix, assignments, centroids, model='diagonal_covariance') > \ 18 | gaussianmixture.BIC(fmatrix, assignments, centroids, model='one_variance') 19 | 20 | assert gaussianmixture.AIC(fmatrix, assignments, centroids, model='full_covariance') > \ 21 | gaussianmixture.AIC(fmatrix, assignments, centroids, model='diagonal_covariance') > \ 22 | gaussianmixture.AIC(fmatrix, assignments, centroids, model='one_variance') 23 | 24 | -------------------------------------------------------------------------------- /milk/tests/test_gridsearch.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.gridsearch 2 | import milk.supervised.svm 3 | from milk.supervised.gridsearch import gridminimise, _allassignments, _set_assignment, gridsearch 4 | from milk.tests.fast_classifier import fast_classifier 5 | from nose.tools import raises 6 | import numpy as np 7 | 8 | 9 | def slow_gridminimise(learner, features, labels, params, measure=None): 10 | from ..measures.nfoldcrossvalidation import nfoldcrossvalidation 11 | if measure is None: 12 | measure = np.trace 13 | 14 | best_val = -np.inf 15 | best = None 16 | for assignment in _allassignments(params): 17 | _set_assignment(learner, assignment) 18 | S,_ = nfoldcrossvalidation(features, labels, classifier=learner) 19 | cur = measure(S) 20 | if cur > best_val: 21 | best = assignment 22 | best_val = cur 23 | return best 24 | 25 | 26 | def test_gridsearch(): 27 | from milksets import wine 28 | features, labels = wine.load() 29 | selected = (labels < 2) 30 | features = features[selected] 31 | labels = labels[selected] 32 | 33 | G = milk.supervised.gridsearch( 34 | milk.supervised.svm.svm_raw(), 35 | params={'C':[.01,.1,1.,10.], 36 | 'kernel':[milk.supervised.svm.rbf_kernel(0.1),milk.supervised.svm.rbf_kernel(1.)] 37 | }) 38 | model = G.train(features,labels) 39 | reslabels = [model.apply(f) for f in features] 40 | assert len(reslabels) == len(features) 41 | test_gridsearch.slow = True 42 | 43 | 44 | def test_all_assignments(): 45 | assert len(list(_allassignments({'C': [0,1], 'kernel' : ['a','b','c']}))) == 2 * 3 46 | 47 | class error_learner(object): 48 | def train(self, features, labels, **kwargs): 49 | raise ValueError('oops') 50 | 51 | def set_option(self, k, v): 52 | pass 53 | 54 | @raises(Exception) 55 | def test_with_error(): 56 | from milksets.wine import load 57 | features, labels = load() 58 | learner = error_learner() 59 | G = milk.supervised.gridsearch(
60 | error_learner(), 61 | params = { 'error' : list(range(3)), 'error2' : list(range(5)) } 62 | ) 63 | G.train(features,labels) 64 | 65 | 66 | class simple_model: 67 | def __init__(self, c): 68 | self.c = c 69 | def apply(self, f): 70 | return self.c 71 | 72 | def f(a,b,c): 73 | return a**2 + b**3 + c 74 | 75 | class simple_learner: 76 | def set_option(self, k, v): 77 | setattr(self, k, v) 78 | def train(self, fs, ls, normalisedlabels=False): 79 | return simple_model(f(self.a, self.b, self.c)) 80 | 81 | def test_gridminimise(): 82 | features = np.arange(100) 83 | labels = np.tile((0,1), 50) 84 | paramspace = { 'a': np.arange(4), 'b' : np.arange(-3,3), 'c' : np.linspace(2., 10) } 85 | best,value = gridminimise(simple_learner(), features, labels, paramspace, measure=(lambda _, p: p[0]), return_value=True) 86 | best = dict(best) 87 | val = f(best['a'], best['b'], best['c']) 88 | assert value == val*100 89 | for a in np.arange(4): 90 | for b in np.arange(-3,3): 91 | for c in np.linspace(2., 10): 92 | assert val <= f(a,b,c) 93 | gs = gridsearch(simple_learner(), paramspace, measure=(lambda _, p: p[0]), annotate=True) 94 | model = gs.train(features, labels) 95 | assert model.value == value 96 | assert model.arguments == val 97 | 98 | def test_gridminimise_wine(): 99 | from milksets.wine import load 100 | features, labels = load() 101 | x = gridminimise(milk.supervised.svm_simple(kernel=np.dot, C=2.), features[::2], labels[::2] == 0, {'C' : (0.5,) }) 102 | cval, = x 103 | assert cval == ('C', .5) 104 | 105 | def test_gridminimise_return(): 106 | from milksets.wine import load 107 | features,labels = load() 108 | learner = fast_classifier() 109 | gridminimise(learner, features, labels, { 'ignore' : [0] }) 110 | _,error = gridminimise(learner, features, labels, { 'ignore' : [0] }, return_value=True, nfolds=5) 111 | cmat,_ = milk.nfoldcrossvalidation(features, labels, learner=learner, nfolds=5) 112 | assert error == cmat.sum()-cmat.trace() 113 | -------------------------------------------------------------------------------- /milk/tests/test_grouped.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.svm 3 | from milk.supervised.svm import rbf_kernel 4 | import milk.supervised.multi 5 | import milk.supervised.grouped 6 | from milk.supervised.classifier import ctransforms 7 | import milksets.wine 8 | 9 | def group(features, labels, step): 10 | N = len(labels) 11 | i = 0 12 | gfeatures = [] 13 | glabels = [] 14 | while i < N: 15 | next = i + step 16 | while next > N or labels[next-1] != labels[i]: next -= 1 17 | gfeatures.append(features[i:next]) 18 | glabels.append(labels[i]) 19 | i = next 20 | return gfeatures, glabels 21 | 22 | 23 | 24 | def test_voting(): 25 | base = ctransforms(milk.supervised.svm.svm_raw(C=2.,kernel=milk.supervised.svm.rbf_kernel(2.**-3)),milk.supervised.svm.svm_binary()) 26 | base = milk.supervised.multi.one_against_rest(base) 27 | features,labels = milksets.wine.load() 28 | gfeatures, glabels = group(features, labels, 3) 29 | 30 | learner = milk.supervised.grouped.voting_classifier(base) 31 | learner.train(gfeatures, glabels) 32 | model = learner.train(gfeatures, glabels) 33 | assert ([model.apply(f) for f in gfeatures] == np.array(glabels)).mean() > .8 34 | 35 | 36 | def test_filter_outliers(): 37 | np.random.seed(22) 38 | features = [np.random.randn(10,10) for i in range(20)] 39 | for f in features: 40 | f[0] *= 10 41 | 42 | trainer = milk.supervised.grouped.filter_outliers(.9) 43 | model = 
trainer.train(features, [0] * len(features)) 44 | for f in features: 45 | ff = model.apply(f) 46 | assert np.all(ff == f[1:]) 47 | 48 | 49 | 50 | def test_nfoldcrossvalidation(): 51 | np.random.seed(22) 52 | features = np.array([np.random.rand(8+(i%3), 12)*(i//20) for i in range(40)], dtype=object) 53 | labels = np.zeros(40, int) 54 | labels[20:] = 1 55 | classifier = milk.supervised.grouped.voting_classifier(milk.supervised.svm_simple(C=1., kernel=rbf_kernel(1./12))) 56 | cmat, names = milk.nfoldcrossvalidation(features, labels, classifier=classifier) 57 | assert cmat.shape == (2,2) 58 | assert sorted(names) == list(range(2)) 59 | 60 | 61 | 62 | class identity_classifier(object): 63 | def train(self, features, labels): 64 | return identity_model() 65 | 66 | class identity_model(object): 67 | def apply(self, f): 68 | return f 69 | 70 | 71 | def test_meanclassif(): 72 | gfeatures = [np.arange(10), np.arange(10)%2] 73 | glabels = [0,1] 74 | meanclassif = milk.supervised.grouped.mean_classifier(identity_classifier()) 75 | model = meanclassif.train(gfeatures, glabels) 76 | assert model.apply(gfeatures[0]) == np.arange(10).mean() 77 | assert model.apply(gfeatures[1]) == .5 78 | 79 | -------------------------------------------------------------------------------- /milk/tests/test_kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.unsupervised 3 | from milk.unsupervised.kmeans import assign_centroids, repeated_kmeans 4 | 5 | def test_kmeans(): 6 | np.random.seed(132) 7 | features = np.r_[np.random.rand(20,3)-.5,.5+np.random.rand(20,3)] 8 | def test_distance(dist, kwargs={}): 9 | assignments, _ = milk.unsupervised.kmeans(features, 2, distance=dist, **kwargs) 10 | positions = [0]*20 + [1]*20 11 | correct = (assignments == positions).sum() 12 | assert correct >= 38 or correct <= 2 13 | yield test_distance, 'euclidean' 14 | yield test_distance, 'seuclidean' 15 | yield test_distance, 'mahalanobis', { 'icov' : np.eye(3) } 16 | 17 | def test_kmeans_centroids(): 18 | np.random.seed(132) 19 | features = np.random.rand(201,30) 20 | for k in [2,3,5,10]: 21 | indices,centroids = milk.unsupervised.kmeans(features, k) 22 | for i in range(k): 23 | if np.any(indices == i): 24 | assert np.allclose(centroids[i], features[indices == i].mean(0)) 25 | 26 | 27 | def test_assign_cids(): 28 | from milksets.wine import load 29 | features,_ = load() 30 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 31 | assert np.all(assign_centroids(features, centroids) == assigns) 32 | 33 | def test_non_contiguous_fmatrix(): 34 | from milksets.wine import load 35 | features,_ = load() 36 | features = features[:,::2] 37 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 38 | assert np.all(assign_centroids(features, centroids) == assigns) 39 | 40 | features = features.astype(np.int32) 41 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 42 | assert np.all(assign_centroids(features, centroids) == assigns) 43 | 44 | 45 | def test_repeated_kmeans(): 46 | np.random.seed(132) 47 | features = np.random.rand(201,30) 48 | cids,cs = repeated_kmeans(features, 3, 2) 49 | assert len(cids) == len(features) 50 | 51 | def test_kmeans_return_partial(): 52 | np.random.seed(132) 53 | features = np.r_[np.random.rand(20,3)-.5,.5+np.random.rand(20,3)] 54 | assignments,centroids = milk.unsupervised.kmeans(features, 2, R=129) 55 | centroids_ = milk.unsupervised.kmeans(features, 2, R=129, 
return_assignments=False) 56 | assignments_ = milk.unsupervised.kmeans(features, 2, R=129, return_centroids=False) 57 | assert np.all(centroids == centroids_) 58 | assert np.all(assignments == assignments_) 59 | 60 | 61 | 62 | def test_kmeans_all_equal(): 63 | import milk.unsupervised._kmeans 64 | np.random.seed(132) 65 | for _ in range(8): 66 | a = (np.random.random(1024*128)*250).astype(int) 67 | b = a.copy() 68 | assert milk.unsupervised._kmeans.are_equal(a,b) 69 | a[3435] += 1 70 | assert not milk.unsupervised._kmeans.are_equal(a,b) 71 | -------------------------------------------------------------------------------- /milk/tests/test_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.knn 3 | 4 | def test_simple(): 5 | X=np.array([ 6 | [0,0,0], 7 | [1,1,1], 8 | ]) 9 | Y=np.array([ 1, -1 ]) 10 | kNN = milk.supervised.knn.kNN(1) 11 | kNN = kNN.train(X,Y) 12 | assert kNN.apply(X[0]) == Y[0] 13 | assert kNN.apply(X[1]) == Y[1] 14 | assert kNN.apply([0,0,1]) == Y[0] 15 | assert kNN.apply([0,1,1]) == Y[1] 16 | 17 | def test_nnclassifier(): 18 | labels=[0,1] 19 | data=[[0.,0.],[1.,1.]] 20 | C = milk.supervised.knn.kNN(1) 21 | model = C.train(data,labels) 22 | assert model.apply(data[0]) == 0 23 | assert model.apply(data[1]) == 1 24 | assert model.apply([.01,.01]) == 0 25 | assert model.apply([.99,.99]) == 1 26 | assert model.apply([100,100]) == 1 27 | assert model.apply([-100,-100]) == 0 28 | assert model.apply([.9,.9]) == 1 29 | middle = model.apply([.5,.5]) 30 | assert (middle == 0) or (middle == 1) 31 | 32 | def test_approx_nnclassifier(): 33 | import milksets.wine 34 | features,labels = milksets.wine.load() 35 | for k in (1,3,5): 36 | learner = milk.supervised.knn.approximate_knn_learner(k) 37 | model = learner.train(features[::2], labels[::2]) 38 | testing = model.apply_many(features[1::2]) 39 | assert np.mean(testing == labels[1::2]) > .5 40 | -------------------------------------------------------------------------------- /milk/tests/test_lasso.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.lasso import lasso_learner 2 | import milk.supervised.lasso 3 | import numpy as np 4 | 5 | def test_lasso_smoke(): 6 | np.random.seed(3) 7 | for i in range(8): 8 | X = np.random.rand(100,10) 9 | Y = np.random.rand(5,10) 10 | B = np.random.rand(5,100) 11 | before = np.linalg.norm(Y - np.dot(B,X)) 12 | B = milk.supervised.lasso(X,Y) 13 | after = np.linalg.norm(Y - np.dot(B,X)) 14 | assert after < before 15 | assert np.all(~np.isnan(B)) 16 | 17 | def test_lasso_nans(): 18 | np.random.seed(3) 19 | for i in range(8): 20 | X = np.random.rand(100,10) 21 | Y = np.random.rand(5,10) 22 | B = np.random.rand(5,100) 23 | for j in range(12): 24 | Y.flat[np.random.randint(0,Y.size-1)] = float('nan') 25 | B = milk.supervised.lasso(X,Y) 26 | assert np.all(~np.isnan(B)) 27 | 28 | def test_lam_zero(): 29 | np.random.seed(2) 30 | for i in range(8): 31 | X = np.random.rand(24,2) 32 | Y = np.random.rand(1,2) 33 | B = milk.supervised.lasso(X,Y, lam=0.0) 34 | R = Y - np.dot(B,X) 35 | R = R.ravel() 36 | assert np.dot(R,R) < .01 37 | 38 | 39 | def test_lasso_walk(): 40 | np.random.seed(5) 41 | for i in range(4): 42 | X = np.random.rand(100,10) 43 | Y = np.random.rand(5,10) 44 | Bs = milk.supervised.lasso_walk(X,Y, start=.0001, nr_steps=3) 45 | B0 = milk.supervised.lasso(X,Y, lam=.0001) 46 | assert np.all(Bs[0] == B0) 47 | assert not np.all(Bs[0] == Bs[-1]) 48 | assert len(Bs) 
== 3 49 | 50 | def test_lasso_walk_nans(): 51 | np.random.seed(5) 52 | for i in range(3): 53 | X = np.random.rand(100,10) 54 | Y = np.random.rand(5,10) 55 | B = np.random.rand(5,100) 56 | for j in range(12): 57 | Y.flat[np.random.randint(0,Y.size-1)] = float('nan') 58 | B = milk.supervised.lasso_walk(X,Y, nr_steps=6) 59 | assert np.all(~np.isnan(B)) 60 | 61 | 62 | def test_learner(): 63 | np.random.seed(334) 64 | learner = lasso_learner() 65 | X = np.random.rand(100,10) 66 | Y = np.random.rand(5,10) 67 | model = learner.train(X,Y) 68 | test = model.apply(np.random.rand(100)) 69 | assert len(test) == len(Y) 70 | -------------------------------------------------------------------------------- /milk/tests/test_logistic.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.logistic 2 | import milksets.wine 3 | import numpy as np 4 | def test_better_than_random(): 5 | learner = milk.supervised.logistic.logistic_learner() 6 | features, labels = milksets.wine.load() 7 | model = learner.train(features, labels == 0) 8 | error = np.array([np.abs(model.apply(f)-(l == 0)) 9 | for f,l in zip(features, labels)]) 10 | assert error.mean() < .1 11 | -------------------------------------------------------------------------------- /milk/tests/test_measures.py: -------------------------------------------------------------------------------- 1 | import milk.measures.measures 2 | import milk.measures.curves 3 | import numpy as np 4 | import numpy 5 | from milk.measures import accuracy, waccuracy, bayesian_significance 6 | 7 | def test_100(): 8 | C=numpy.zeros((2,2)) 9 | C[0,0]=100 10 | C[1,1]=50 11 | assert accuracy(C) == 1. 12 | assert waccuracy(C) == 1. 13 | 14 | def test_0(): 15 | C = numpy.array([ 16 | [0, 10], 17 | [10, 0] 18 | ]) 19 | assert waccuracy(C) == 0. 20 | assert accuracy(C) == 0. 21 | 22 | def test_50(): 23 | C = numpy.array([ 24 | [10, 10], 25 | [10, 10] 26 | ]) 27 | assert accuracy(C) == .5 28 | assert waccuracy(C) == .5 29 | 30 | def test_unbalanced(): 31 | C = numpy.array([ 32 | [20, 10], 33 | [10, 0] 34 | ]) 35 | assert accuracy(C) == .5 36 | assert waccuracy(C) == 1./3 37 | 38 | 39 | 40 | def test_confusion_matrix(): 41 | np.random.seed(323) 42 | labels0 = np.arange(101)%3 43 | labels1 = (labels0 + np.random.rand(101)*2).astype(np.int) % 3 44 | cmat = milk.measures.measures.confusion_matrix(labels0, labels1) 45 | for i in range(3): 46 | for j in range(3): 47 | assert cmat[i,j] == np.sum( (labels0 == i) & (labels1 == j) ) 48 | 49 | 50 | 51 | def test_significance(): 52 | assert np.allclose(.5, [bayesian_significance(1024,i,i) for i in range(0, 1025, 3)]) 53 | 54 | 55 | def test_roc(): 56 | np.random.seed(3) 57 | for i in range(4): 58 | labels = np.repeat([False,True], 50) 59 | response = labels + np.random.random(100)*i 60 | P,R = milk.measures.curves.roc(response, labels != 0) 61 | assert P.min() >= 0. 62 | assert R.min() >= 0. 63 | assert P.max() <= 1. 64 | assert R.max() <= 1. 65 | -------------------------------------------------------------------------------- /milk/tests/test_measures_clusters.py: -------------------------------------------------------------------------------- 1 | import milk.measures.cluster_agreement 2 | import numpy as np 3 | def test_rand_arand_jaccard(): 4 | np.random.seed(33) 5 | 6 | labels = np.repeat(np.arange(4),10) 7 | clusters = np.repeat(np.arange(4),10) 8 | 9 | a0,b0,c0= milk.measures.cluster_agreement.rand_arand_jaccard(clusters, labels) 10 | assert a0 == 1. 11 | assert b0 == 1. 
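# Identical partitions must score exactly 1 under both the Rand and adjusted
# Rand indices; shuffling the cluster labels below should push agreement
# strictly below 1 and make the Jaccard score drop below the perfect-match c0.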
12 | 13 | np.random.shuffle(clusters) 14 | a1,b1,c1= milk.measures.cluster_agreement.rand_arand_jaccard(clusters, labels) 15 | assert a1 >= 0. 16 | assert a1 < 1. 17 | assert b1 < 1. 18 | assert b1 >= 0. 19 | assert c1 < c0 20 | 21 | -------------------------------------------------------------------------------- /milk/tests/test_multi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import milk.supervised.svm 4 | import milk.supervised.multi 5 | from milk.supervised.classifier import ctransforms 6 | from .fast_classifier import fast_classifier 7 | 8 | import milksets.wine 9 | features,labels = milksets.wine.load() 10 | A = np.arange(len(features)) 11 | random.seed(9876543210) 12 | random.shuffle(A) 13 | features = features[A] 14 | labels = labels[A] 15 | labelset = set(labels) 16 | base = ctransforms(milk.supervised.svm.svm_raw(C=2.,kernel=milk.supervised.svm.rbf_kernel(2.**-3)),milk.supervised.svm.svm_binary()) 17 | 18 | def test_one_against_rest(): 19 | M = milk.supervised.multi.one_against_rest(base) 20 | M = M.train(features[:100,:],labels[:100]) 21 | tlabels = [M.apply(f) for f in features[100:]] 22 | for tl in tlabels: 23 | assert tl in labelset 24 | 25 | def test_one_against_one(): 26 | M = milk.supervised.multi.one_against_one(base) 27 | M = M.train(features[:100,:],labels[:100]) 28 | tlabels = [M.apply(f) for f in features[100:]] 29 | for tl in tlabels: 30 | assert tl in labelset 31 | tlabels_many = M.apply_many(features[100:]) 32 | assert np.all(tlabels == tlabels_many) 33 | 34 | def test_two_thirds(): 35 | np.random.seed(2345) 36 | C = milk.supervised.defaultclassifier('fast') 37 | X = np.random.rand(120,4) 38 | X[:40] += np.random.rand(40,4) 39 | X[:40] += np.random.rand(40,4) 40 | X[40:80] -= np.random.rand(40,4) 41 | X[40:80] -= np.random.rand(40,4) 42 | Y = np.repeat(np.arange(3), 40) 43 | model = C.train(X,Y) 44 | Y_ = np.array([model.apply(x) for x in X]) 45 | assert (Y_ == Y).mean() * 3 > 2 46 | 47 | def test_multi_labels(): 48 | clabels = [[lab, lab+7] for lab in labels] 49 | multi_label = milk.supervised.multi.one_against_rest_multi(base) 50 | model = multi_label.train(features[::2], clabels[::2]) 51 | test_vals = [model.apply(f) for f in features[1::2]] 52 | for ts in test_vals: 53 | if 0.0 in ts: assert 7.0 in ts 54 | if 1.0 in ts: assert 8.0 in ts 55 | if 2.0 in ts: assert 9.0 in ts 56 | 57 | 58 | def test_classifier_no_set_options(): 59 | # Basically these should not raise an exception 60 | milk.supervised.multi.one_against_rest_multi(fast_classifier()) 61 | milk.supervised.multi.one_against_rest(fast_classifier()) 62 | milk.supervised.multi.one_against_one(fast_classifier()) 63 | 64 | 65 | def test_tree(): 66 | mtree = milk.supervised.multi.multi_tree_learner(fast_classifier()) 67 | labels = [0,1,2,2,3,3,3,3] 68 | features = np.random.random_sample((len(labels), 8)) 69 | model = mtree.train(features, labels) 70 | counts = np.zeros(4) 71 | for ell in labels: 72 | counts[ell] += 1 73 | 74 | g0,g1 = milk.supervised.multi.split(counts) 75 | assert np.all(g0 == [3]) or np.all(g1 == [3]) 76 | def list_to_zero(v): 77 | if isinstance(v, list): 78 | return 1000 79 | return v 80 | def r(m): 81 | if len(m) == 1: return int(m[0]) 82 | else: return sorted([r(m[1]), r(m[2])], key=list_to_zero) 83 | assert r(model.model) == [3,[2,[0,1]]] 84 | 85 | -------------------------------------------------------------------------------- /milk/tests/test_multi_label.py: 
-------------------------------------------------------------------------------- 1 | from milk.tests.fast_classifier import fast_classifier 2 | import milk.supervised.multi_label 3 | import milk 4 | import numpy as np 5 | 6 | def test_one_by_one(): 7 | np.random.seed(23) 8 | r = np.random.random 9 | ps = np.array([.7,.5,.8,.3,.8]) 10 | learner = milk.supervised.multi_label.one_by_one(fast_classifier()) 11 | universe = list(range(len(ps))) 12 | 13 | for _ in range(10): 14 | labels = [] 15 | features = [] 16 | bases = [np.random.rand(20) for pj in ps] 17 | for i in range(256): 18 | cur = [] 19 | curf = np.zeros(20,float) 20 | for j,pj in enumerate(ps): 21 | if r() < pj: 22 | cur.append(j) 23 | curf += r()*bases[j] 24 | if not cur: continue 25 | labels.append(cur) 26 | features.append(curf) 27 | 28 | model = learner.train(features, labels) 29 | predicted = model.apply_many(features) 30 | matrix = np.zeros((2,2), int) 31 | for t,p in zip(labels, predicted): 32 | for ell in universe: 33 | row = (ell in t) 34 | col = (ell in p) 35 | matrix[row,col] += 1 36 | Tn,Fp = matrix[0] 37 | Fn,Tp = matrix[1] 38 | prec = Tp/float(Tp+Fp) 39 | recall = Tp/float(Tp+Fn) 40 | F1 = 2*prec*recall/(prec + recall) 41 | assert F1 > .3 42 | -------------------------------------------------------------------------------- /milk/tests/test_multi_view.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.multi_view 2 | import numpy as np 3 | import milk.supervised.svm 4 | from milk.supervised.defaultclassifier import feature_selection_simple 5 | 6 | def test_multi_view(): 7 | from milksets.wine import load 8 | features, labels = load() 9 | features0 = features[::10] 10 | features1 = features[1::10] 11 | features2 = features[2::10] 12 | labels0 = labels[::10] 13 | labels1 = labels[1::10] 14 | labels2 = labels[2::10] 15 | 16 | assert np.all(labels0 == labels1) 17 | assert np.all(labels1 == labels2) 18 | labels = labels0 19 | train_features = list(zip(features0,features1,features2)) 20 | test_features = list(zip(features[3::10], features[4::10], features[5::10])) 21 | base = milk.supervised.classifier.ctransforms( 22 | feature_selection_simple(), 23 | milk.supervised.svm.svm_raw(C=128, kernel=milk.supervised.svm.rbf_kernel(4.)), 24 | milk.supervised.svm.svm_sigmoidal_correction() 25 | ) 26 | classifier = milk.supervised.multi_view.multi_view_classifier([base,base,base]) 27 | model = classifier.train(train_features, labels == 0) 28 | assert ([model.apply(f) for f in test_features] == (labels == 0)).mean() > .9 29 | -------------------------------------------------------------------------------- /milk/tests/test_nnmf.py: -------------------------------------------------------------------------------- 1 | import milk.unsupervised 2 | import numpy as np 3 | def test_nnmf(): 4 | def test3(method): 5 | np.random.seed(8) 6 | X3 = np.random.rand(20,3) 7 | X = np.c_[ X3, 8 | X3*2+np.random.rand(20,3)/20., 9 | -X3*2+np.random.rand(20,3)/10.] 10 | W,V = method(X, 3, R=7) 11 | assert np.sum((np.dot(W,V)-X)**2)/np.sum(X**2) < .5 12 | 13 | yield test3, milk.unsupervised.lee_seung 14 | yield test3, milk.unsupervised.sparse_nnmf 15 | 16 | def test_sparse_nnmf(): 17 | # This is really just a smoke test because the test case is not sparse!! 18 | from milk.unsupervised import sparse_nnmf 19 | np.random.seed(8) 20 | X3 = np.random.rand(20,3) 21 | X = np.c_[ X3, 22 | X3*2+np.random.rand(20,3)/20., 23 | -X3*2+np.random.rand(20,3)/10.] 
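# X is (20, 9): three random base columns plus two noisy linear images of
# them, so it is approximately rank 3; a 3-component factorisation should
# therefore leave a residual with less variance than X itself (checked below).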
24 | W,V = sparse_nnmf(X, 3, sparsenessW=.7, sparsenessH=.7, R=7) 25 | assert not np.any(np.isnan(W)) 26 | assert not np.any(np.isnan(V)) 27 | error = np.dot(W,V)-X 28 | assert error.var() < X.var() 29 | 30 | 31 | 32 | def test_hoyer_project(): 33 | from milk.unsupervised.nnmf.hoyer import _L1for, _project 34 | def sp(n, L1, L2): 35 | return (np.sqrt(n) - L1/L2)/(np.sqrt(n) - 1) 36 | sparseness = .6 37 | n = 9. 38 | row = np.arange(int(n))/n 39 | L2 = np.sqrt(np.dot(row, row)) 40 | L1 = _L1for(sparseness, row, L2) 41 | 42 | assert np.abs(sp(n, L1, L2) - sparseness) < 1.e-4 43 | row_ = _project(row, L1, L2) 44 | assert not np.any(np.isnan(row_)) 45 | assert np.all(row_ >= 0) 46 | 47 | L2 = np.sqrt(np.dot(row, row)) 48 | L1 = np.sum(np.abs(row_)) 49 | res = sp(n, L1, L2) 50 | assert np.abs(res - sparseness) < 1.e-4 51 | 52 | -------------------------------------------------------------------------------- /milk/tests/test_normalise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
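# The tests below cover milk.supervised.normalise (zscore_normalise,
# interval_normalise, and the sample_to_2min class-balancing helper) plus
# the _nanstd helper from milk.unsupervised.normalise.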
22 | 23 | from __future__ import division 24 | import numpy 25 | import numpy as np 26 | from milk.supervised.normalise import sample_to_2min 27 | import milk.supervised.normalise 28 | 29 | 30 | def test_zscore_normalise(): 31 | I=milk.supervised.normalise.zscore_normalise() 32 | numpy.random.seed(1234) 33 | features = numpy.random.rand(20,100) 34 | L = numpy.zeros(100) 35 | model = I.train(features, L) 36 | transformed = np.array([model.apply(f) for f in features]) 37 | assert np.all( transformed.mean(0)**2 < 1e-7 ) 38 | assert np.all( np.abs(transformed.std(0) - 1) < 1e-3 ) 39 | 40 | 41 | def test_sample_to_2min(): 42 | A = np.zeros(256, np.int32) 43 | def test_one(A): 44 | selected = sample_to_2min(A) 45 | ratios = [] 46 | for l0 in set(A): 47 | for l1 in set(A): 48 | ratios.append( (A[selected] == l0).sum() / (A[selected] == l1).sum() ) 49 | assert np.max(ratios) <= 2.001 50 | A[20:] = 1 51 | yield test_one, A 52 | 53 | A[21:] = 1 54 | yield test_one, A 55 | 56 | A[129:] = 2 57 | yield test_one, A 58 | 59 | def test_sample_to_2min_list(): 60 | from collections import defaultdict 61 | def count(xs): 62 | counts = defaultdict(int) 63 | for x in xs: 64 | counts[x] += 1 65 | return counts 66 | labels = ["A"]*8 + ["B"]*12 + ["C"]*16 + ["D"] * 24 + ["E"] * 1000 67 | selected = sample_to_2min(labels) 68 | before = count(labels) 69 | after = count(np.array(labels)[selected]) 70 | assert max(after.values()) == min(before.values())*2 71 | 72 | 73 | def test_interval_normalise(): 74 | interval = milk.supervised.normalise.interval_normalise() 75 | np.random.seed(105) 76 | features = np.random.randn(100, 5) 77 | model = interval.train(features, features[0] > 0) 78 | transformed = np.array([model.apply(f) for f in features]) 79 | assert np.allclose(transformed.min(0), -1) 80 | assert np.allclose(transformed.max(0), +1) 81 | 82 | 83 | 84 | def test_nanstd(): 85 | from milk.unsupervised.normalise import _nanstd 86 | np.random.seed(234) 87 | for i in range(8): 88 | x = np.random.rand(200,231) 89 | assert np.allclose(_nanstd(x,0), x.std(0)) 90 | assert np.allclose(_nanstd(x,1), x.std(1)) 91 | -------------------------------------------------------------------------------- /milk/tests/test_normaliselabels.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.normalise import normaliselabels 2 | import numpy as np 3 | 4 | def test_normaliselabels(): 5 | np.random.seed(22) 6 | labels = np.zeros(120, np.uint8) 7 | labels[40:] += 1 8 | labels[65:] += 1 9 | reorder = np.argsort(np.random.rand(len(labels))) 10 | labels = labels[reorder] 11 | labels2,names = normaliselabels(labels) 12 | for new_n,old_n in enumerate(names): 13 | assert np.all( (labels == old_n) == (labels2 == new_n) ) 14 | 15 | def test_normaliselabels_multi(): 16 | np.random.seed(30) 17 | r = np.random.random 18 | for v in range(10): 19 | labels = [] 20 | p = np.array([.24,.5,.1,.44]) 21 | for i in range(100): 22 | cur = [j for j in range(4) if r() < p[j]] 23 | if not cur: cur = [0] 24 | labels.append(cur) 25 | nlabels, names = normaliselabels(labels, True) 26 | assert len(labels) == len(nlabels) 27 | assert len(nlabels[0]) == max(list(map(max,labels)))+1 28 | assert nlabels.sum() == sum(map(len,labels)) 29 | 30 | -------------------------------------------------------------------------------- /milk/tests/test_parzen.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import milk.supervised.normalise 3 | from milk.supervised.parzen 
import get_parzen_rbf_loocv 4 | import numpy as np 5 | import milksets 6 | 7 | def _slow_parzen(features, labels, sigma): 8 | correct = 0 9 | N = len(features) 10 | labels = 2*labels - 1 11 | def kernel(fi, fj): 12 | return np.exp(-((fi-fj)**2).sum()/sigma) 13 | for i in range(N): 14 | C = 0. 15 | for j in range(N): 16 | if i == j: continue 17 | C += labels[j] * kernel(features[i],features[j]) 18 | if (C*labels[i] > 0): correct += 1 19 | return correct/N 20 | 21 | def test_parzen(): 22 | features,labels = milksets.wine.load() 23 | labels = (labels == 1) 24 | features = milk.supervised.normalise.zscore(features) 25 | f = get_parzen_rbf_loocv(features, labels) 26 | sigmas = 2.**np.arange(-4,4) 27 | for s in sigmas: 28 | assert abs(_slow_parzen(features, labels, s) - f(s)) < 1e-6 29 | -------------------------------------------------------------------------------- /milk/tests/test_pca.py: -------------------------------------------------------------------------------- 1 | import numpy.random 2 | import milk.unsupervised.pca 3 | import numpy as np 4 | 5 | def test_pca(): 6 | numpy.random.seed(123) 7 | X = numpy.random.rand(10,4) 8 | X[:,1] += numpy.random.rand(10)**2*X[:,0] 9 | X[:,1] += numpy.random.rand(10)**2*X[:,0] 10 | X[:,2] += numpy.random.rand(10)**2*X[:,0] 11 | Y,V = milk.unsupervised.pca(X) 12 | Xn = milk.unsupervised.normalise.zscore(X) 13 | assert X.shape == Y.shape 14 | assert ((np.dot(V[:4].T,Y[:,:4].T).T-Xn)**2).sum()/(Xn**2).sum() < .3 15 | 16 | def test_mds(): 17 | from milk.unsupervised import pdist 18 | np.random.seed(232) 19 | for _ in range(12): 20 | features = np.random.random_sample((12,4)) 21 | X = milk.unsupervised.mds(features,4) 22 | D = pdist(features) 23 | D2 = pdist(X) 24 | assert np.mean( (D - D2) ** 2) < 10e-4 25 | 26 | 27 | def test_mds_dists(): 28 | from milk.unsupervised import pdist 29 | np.random.seed(232) 30 | for _ in range(12): 31 | features = np.random.random_sample((12,4)) 32 | D = pdist(features) 33 | X = milk.unsupervised.mds(features,4) 34 | X2 = milk.unsupervised.mds_dists(D, 4) 35 | assert np.mean( (X - X2) ** 2) < 10e-4 36 | 37 | 38 | 39 | def test_mds_list(): 40 | from milk.unsupervised.pca import mds 41 | data = np.random.random((128,16)) 42 | V = mds(data,2) 43 | V2 = mds(list(data),2) 44 | assert np.all(V == V2) 45 | 46 | def test_mds_regression_eig_order(): 47 | from milk.unsupervised.pca import mds_dists 48 | # This was part of a much larger computation, but this isolated the bug: 49 | dists = np.array([[ 50 | 0. , 377241.01101501, 390390.47006156, 51 | 340764.02535826, 421258.30020762, 470960.15365819, 52 | 331864.64507197, 213029.60122458, 306976.87583849], 53 | [ 377241.01101501, 0. , 159390.25449606, 54 | 140506.60640227, 140922.67044651, 221684.10621381, 55 | 130161.14561428, 224134.4629224 , 225617.6525412 ], 56 | [ 390390.47006156, 159390.25449606, 0. , 57 | 188417.11617804, 192114.58972062, 238026.3963446 , 58 | 159070.76483779, 242792.81436928, 228843.70200362], 59 | [ 340764.02535826, 140506.60640227, 188417.11617804, 60 | 0. , 247098.49216397, 265783.27794352, 61 | 161672.29500768, 170503.64299615, 171360.11464776], 62 | [ 421258.30020762, 140922.67044651, 192114.58972062, 63 | 247098.49216397, 0. , 246385.36543382, 64 | 153380.00248566, 276707.33890808, 276009.04198403], 65 | [ 470960.15365819, 221684.10621381, 238026.3963446 , 66 | 265783.27794352, 246385.36543382, 0. 
, 67 | 252609.80940353, 327987.54137854, 308492.70255307], 68 | [ 331864.64507197, 130161.14561428, 159070.76483779, 69 | 161672.29500768, 153380.00248566, 252609.80940353, 70 | 0. , 179275.66833105, 192598.94271197], 71 | [ 213029.60122458, 224134.4629224 , 242792.81436928, 72 | 170503.64299615, 276707.33890808, 327987.54137854, 73 | 179275.66833105, 0. , 117004.41340669], 74 | [ 306976.87583849, 225617.6525412 , 228843.70200362, 75 | 171360.11464776, 276009.04198403, 308492.70255307, 76 | 192598.94271197, 117004.41340669, 0. ]]) 77 | V = milk.unsupervised.mds_dists(dists, 2) 78 | assert V[:,1].ptp() > 1. 79 | -------------------------------------------------------------------------------- /milk/tests/test_pdist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.unsupervised import pdist, plike 3 | 4 | def test_pdist(): 5 | np.random.seed(222) 6 | X = np.random.randn(100,23) 7 | Y = np.random.randn(80,23) 8 | Dxx = pdist(X) 9 | for i in range(X.shape[0]): 10 | for j in range(X.shape[0]): 11 | assert np.allclose(Dxx[i,j], np.sum((X[i]-X[j])**2)) 12 | 13 | Dxy = pdist(X,Y) 14 | for i in range(X.shape[0]): 15 | for j in range(Y.shape[0]): 16 | assert np.allclose(Dxy[i,j], np.sum((X[i]-Y[j])**2)) 17 | Dxye = pdist(X, Y, 'euclidean') 18 | assert np.allclose(Dxye, np.sqrt(Dxy)) 19 | 20 | def test_plike(): 21 | np.random.seed(222) 22 | X = np.random.randn(100,23) 23 | Lxx = plike(X) 24 | assert len(Lxx) == len(Lxx.T) 25 | Lxx2 = plike(X, sigma2=.001) 26 | assert Lxx[0,1] != Lxx2[0,1] 27 | assert Lxx[0,0] == Lxx2[0,0] 28 | -------------------------------------------------------------------------------- /milk/tests/test_perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.perceptron import perceptron_learner 3 | from milk.supervised import _perceptron 4 | from milksets.yeast import load 5 | 6 | def test_raw(): 7 | np.random.seed(23) 8 | data = np.random.random((100,10)) 9 | data[50:] += .5 10 | labels = np.repeat((0,1), 50) 11 | weights = np.zeros((11)) 12 | eta = 0.1 13 | for i in range(20): 14 | _perceptron.perceptron(data, labels, weights, eta) 15 | errs = _perceptron.perceptron(data, labels, weights, eta) 16 | assert errs < 10 17 | 18 | def test_wrapper(): 19 | features,labels = load() 20 | labels = (labels >= 5) 21 | 22 | learner = perceptron_learner() 23 | model = learner.train(features, labels) 24 | test = list(map(model.apply, features)) 25 | assert np.mean(labels != test) < .35 26 | -------------------------------------------------------------------------------- /milk/tests/test_precluster_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.precluster import precluster_learner, select_precluster 3 | from milk.tests.fast_classifier import fast_classifier 4 | 5 | def c0(): 6 | return np.random.rand(8) 7 | def c1(): 8 | return c0()+2.*np.ones(8) 9 | 10 | def gen_data(seed, with_nums=False): 11 | np.random.seed(seed) 12 | 13 | features = [] 14 | labels =[] 15 | for i in range(200): 16 | f = [] 17 | for j in range(40): 18 | use_0 = (i < 100 and j < 30) or (i >= 100 and j >= 30) 19 | if use_0: f.append(c0()) 20 | else: f.append(c1()) 21 | labels.append((i < 100)) 22 | if with_nums: 23 | features.append((f,[])) 24 | else: 25 | features.append(f) 26 | return features, labels 27 | 28 | 29 | def test_precluster(): 30 | learner = precluster_learner([2], 
base=fast_classifier(), R=12) 31 | features, labels = gen_data(22) 32 | model = learner.train(features,labels) 33 | 34 | assert model.apply([c0() for i in range(35)]) 35 | assert not model.apply([c1() for i in range(35)]) 36 | 37 | def test_codebook_learner(): 38 | learner = select_precluster([2,3,4], base=fast_classifier()) 39 | learner.rmax = 3 40 | features, labels = gen_data(23, 1) 41 | model = learner.train(features,labels) 42 | 43 | assert model.apply(([c0() for i in range(35)],[])) 44 | assert not model.apply(([c1() for i in range(35)],[])) 45 | 46 | def test_codebook_learner_case1(): 47 | learner = select_precluster([2], base=fast_classifier()) 48 | learner.rmax = 1 49 | features, labels = gen_data(23, 1) 50 | model = learner.train(features,labels) 51 | 52 | assert model.apply(([c0() for i in range(35)],[])) 53 | assert not model.apply(([c1() for i in range(35)],[])) 54 | 55 | -------------------------------------------------------------------------------- /milk/tests/test_regression.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import milk.supervised._svm 4 | from gzip import GzipFile 5 | from os import path 6 | from milksets.wine import load 7 | from milk.supervised import defaultclassifier 8 | import milk 9 | 10 | def test_svm_crash(): 11 | from sys import version_info 12 | if version_info.major >= 3: 13 | pickle_load = lambda f: pickle.load(f, encoding='latin1') 14 | else: 15 | pickle_load = pickle.load 16 | 17 | X,Y,kernel, C, eps ,tol, = pickle_load(GzipFile(path.dirname(__file__) + '/data/regression-2-Dec-2009.pp.gz')) 18 | X = X[2:-2,:].copy() 19 | Y = Y[2:-2].copy() 20 | N = len(Y) 21 | Y = Y.astype(np.int32) 22 | p = -np.ones(N,np.double) 23 | params = np.array([0,C,eps,tol],np.double) 24 | Alphas0 = np.zeros(N, np.double) 25 | cache_size = (1<<20) 26 | # The line below crashed milk: 27 | milk.supervised._svm.eval_LIBSVM(X,Y,Alphas0,p,params,kernel,cache_size) 28 | # HASN'T CRASHED! 
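# Regression guard: the gzipped pickle replays the exact (X, Y, kernel, C,
# eps, tol) inputs from a December 2009 crash report, so this test passes
# simply by reaching this point without a segfault.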
29 | 30 | 31 | def test_nov2010(): 32 | # Bug submitted by Mao Ziyang 33 | # This was failing in 0.3.5 because SDA selected no features 34 | np.random.seed(222) 35 | features = np.random.randn(100,20) 36 | features[:50] *= 2 37 | labels = np.repeat((0,1), 50) 38 | 39 | classifier = milk.defaultclassifier() 40 | model = classifier.train(features, labels) 41 | new_label = model.apply(np.random.randn(20)*2) 42 | new_label2 = model.apply(np.random.randn(20)) 43 | assert new_label == 0 44 | assert new_label2 == 1 45 | 46 | def test_default_small(): 47 | features, labels = load() 48 | selected = np.concatenate( [np.where(labels < 2)[0], np.where(labels == 2)[0][:6]] ) 49 | features = features[selected] 50 | labels = labels[selected] 51 | learner = defaultclassifier('fast') 52 | # For version 0.3.8, the line below led to an error 53 | milk.nfoldcrossvalidation(features, labels, classifier=learner) 54 | 55 | -------------------------------------------------------------------------------- /milk/tests/test_regression_constant_features.py: -------------------------------------------------------------------------------- 1 | import milk 2 | import numpy as np 3 | def test_constant_features(): 4 | learner = milk.defaultclassifier() 5 | features = np.ones(20).reshape((-1,1)) 6 | labels = np.zeros(20) 7 | labels[10:] += 1 8 | features[10:] *= -1 9 | learner.train(features, labels) 10 | 11 | -------------------------------------------------------------------------------- /milk/tests/test_rf.py: -------------------------------------------------------------------------------- 1 | from milk.supervised import randomforest 2 | import numpy as np 3 | 4 | def test_rf(): 5 | from milksets import wine 6 | features, labels = wine.load() 7 | features = features[labels < 2] 8 | labels = labels[labels < 2] 9 | learner = randomforest.rf_learner() 10 | model = learner.train(features[::5], labels[::5]) 11 | test = [model.apply(f) for f in features] 12 | assert np.mean(labels == test) > .7 13 | 14 | -------------------------------------------------------------------------------- /milk/tests/test_set2binary_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised import set2binary_array 3 | 4 | def test_set2binary_array_len(): 5 | s2f = set2binary_array.set2binary_array() 6 | inputs = [ np.arange(1,3)*2, np.arange(4)**2, np.arange(6)+2 ] 7 | labels = [0,0,1] 8 | model = s2f.train(inputs,labels) 9 | assert len(model.apply(inputs[0])) == len(model.apply(inputs[1])) 10 | assert len(model.apply(inputs[0])) == len(model.apply(inputs[2])) 11 | assert len(model.apply(inputs[0])) == len(model.apply(list(range(128)))) 12 | 13 | -------------------------------------------------------------------------------- /milk/tests/test_som.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from milk.unsupervised import som 4 | from milk.unsupervised.som import putpoints, closest 5 | 6 | 7 | def _slow_putpoints(grid, points, L=.2): 8 | for point in points: 9 | dpoint = grid-point 10 | y,x = np.unravel_index(np.abs(dpoint).argmin(), dpoint.shape) 11 | for dy in range(-4, +4): 12 | for dx in range(-4, +4): 13 | ny = y + dy 14 | nx = x + dx 15 | if ny < 0 or ny >= grid.shape[0]: 16 | continue 17 | if nx < 0 or nx >= grid.shape[1]: 18 | continue 19 | L2 = L/(1+np.abs(dy)+np.abs(dx)) 20 | grid[ny,nx] *= 1. 
- L2 21 | grid[ny,nx] += point*L2 22 | 23 | 24 | def data_grid(): 25 | np.random.seed(22) 26 | data = np.arange(100000, dtype=np.float32) 27 | grid = np.array([data.flat[np.random.randint(0, data.size)] for i in range(64*64)]).reshape((64,64,1)) 28 | data = data.reshape((-1,1)) 29 | return grid, data 30 | 31 | def test_putpoints(): 32 | grid, points = data_grid() 33 | points = points[:100] 34 | grid2 = grid.copy() 35 | putpoints(grid, points, L=0., R=1) 36 | assert np.all(grid == grid2) 37 | putpoints(grid, points, L=.5, R=1) 38 | assert not np.all(grid == grid2) 39 | 40 | def test_against_slow(): 41 | grid, points = data_grid() 42 | grid2 = grid.copy() 43 | putpoints(grid, points[:10], shuffle=False) 44 | _slow_putpoints(grid2.reshape((64,64)), points[:10]) 45 | assert np.allclose(grid, grid2) 46 | 47 | 48 | def test_som(): 49 | N = 10000 50 | np.random.seed(2) 51 | data = np.array([np.arange(N), N/4.*np.random.randn(N)]) 52 | data = data.transpose().copy() 53 | grid = som(data, (8,8), iterations=3, R=4) 54 | assert grid.shape == (8,8,2) 55 | y,x = closest(grid, data[0]) 56 | assert 0 <= y < grid.shape[0] 57 | assert 0 <= x < grid.shape[1] 58 | 59 | grid2 = grid.copy() 60 | np.random.shuffle(grid2) 61 | full = np.abs(np.diff(grid2[:,:,0], axis=0)).mean() 62 | obs = np.abs(np.diff(grid[:,:,0], axis=0)).mean() 63 | obs2 = np.abs(np.diff(grid[:,:,0], axis=1)).mean() 64 | assert obs + 4*np.abs(obs-obs2) < full 65 | 66 | -------------------------------------------------------------------------------- /milk/tests/test_svm_sigmoidal.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from milk.supervised import svm 3 | import numpy 4 | import numpy as np 5 | 6 | def old_learn_sigmoid_constants(F,Y, 7 | max_iters=None, 8 | min_step=1e-10, 9 | sigma=1e-12, 10 | eps=1e-5): 11 | ''' 12 | Old version. Direct C-like implementation 13 | ''' 14 | # the deci[i] array is called F[i] in this code 15 | F = np.asanyarray(F) 16 | Y = np.asanyarray(Y) 17 | assert len(F) == len(Y) 18 | assert numpy.all( (Y == 1) | (Y == 0) ) 19 | from numpy import log, exp 20 | N=len(F) 21 | if max_iters is None: max_iters = 1000 22 | 23 | prior1 = Y.sum() 24 | prior0 = N-prior1 25 | 26 | small_nr = 1e-4 27 | 28 | hi_t = (prior1+1.)/(prior1+2.) 29 | lo_t = 1./(prior0+2.) 30 | 31 | T = Y*hi_t + (1-Y)*lo_t 32 | 33 | A = 0. 34 | B = log( (prior0+1.)/(prior1+1.) ) 35 | def target(A,B): 36 | fval = 0. 37 | for i in range(N): 38 | fApB = F[i]*A+B 39 | if fApB >= 0: 40 | fval += T[i]*fApB+log(1+exp(-fApB)) 41 | else: 42 | fval += (T[i]-1.)*fApB+log(1+exp(fApB)) 43 | return fval 44 | fval = target(A,B) 45 | for iter in range(max_iters): 46 | h11=sigma 47 | h22=sigma 48 | h21=0. 49 | g1=0. 50 | g2=0. 51 | for i in range(N): 52 | fApB = F[i]*A+B 53 | if (fApB >= 0): 54 | p = exp(-fApB)/(1.+exp(-fApB)) 55 | q = 1./(1.+exp(-fApB)) 56 | else: 57 | p = 1./(1.+exp(fApB)) 58 | q = exp(fApB)/(1.+exp(fApB)) 59 | d2 = p * q 60 | h11 += F[i]*F[i]*d2 61 | h22 += d2 62 | h21 += F[i]*d2 63 | d1 = T[i] - p 64 | g1 += F[i]*d1 65 | g2 += d1 66 | if abs(g1) < eps and abs(g2) < eps: # Stopping criteria 67 | break 68 | 69 | det = h11*h22 - h21*h21 70 | dA = - (h22*g1 - h21*g2)/det 71 | dB = - (h21*g1 + h11*g2)/det 72 | gd = g1*dA + g2*dB 73 | stepsize = 1. 
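# Backtracking line search: start from a full Newton step (stepsize = 1.)
# and halve it until the objective decreases enough (the Armijo-style test
# newf < fval + eps*stepsize*gd below), giving up once stepsize < min_step.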
74 | 75 | while stepsize >= min_step: 76 | newA = A + stepsize*dA 77 | newB = B + stepsize*dB 78 | newf = target(newA,newB) 79 | if newf < fval+eps*stepsize*gd: 80 | A = newA 81 | B = newB 82 | fval = newf 83 | break 84 | stepsize /= 2 85 | if stepsize < min_step: 86 | break 87 | return A,B 88 | 89 | 90 | def test_learn_sigmoid_contants(): 91 | Y = np.repeat((0,1),100) 92 | np.random.seed(3) 93 | for i in range(10): 94 | F = np.random.rand(200)-.3 95 | F[100:] *= -1 96 | old = old_learn_sigmoid_constants(F,Y) 97 | new = svm.learn_sigmoid_constants(F,Y) 98 | assert np.allclose(old, new) 99 | 100 | -------------------------------------------------------------------------------- /milk/tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.tree 2 | import milk.supervised._tree 3 | from milk.supervised._tree import set_entropy 4 | from milk.supervised.tree import information_gain, stump_learner 5 | import numpy as np 6 | 7 | def test_tree(): 8 | from milksets import wine 9 | features, labels = wine.load() 10 | selected = (labels < 2) 11 | features = features[selected] 12 | labels = labels[selected] 13 | C = milk.supervised.tree.tree_classifier() 14 | model = C.train(features,labels) 15 | assert (np.array([model.apply(f) for f in features]) == labels).mean() > .5 16 | 17 | 18 | def test_split_subsample(): 19 | import random 20 | from milksets import wine 21 | features, labels = wine.load() 22 | labels = labels.astype(np.int) 23 | 24 | seen = set() 25 | for i in range(20): 26 | random.seed(2) 27 | i,s = milk.supervised.tree._split(features[::10], labels[::10], None, milk.supervised.tree.information_gain, 2, random) 28 | seen.add(i) 29 | assert len(seen) <= 2 30 | 31 | 32 | def test_set_entropy(): 33 | labels = np.arange(101)%3 34 | counts = np.zeros(3) 35 | entropy = milk.supervised._tree.set_entropy(labels, counts) 36 | slow_counts = np.array([(labels == i).sum() for i in range(3)]) 37 | assert np.all(counts == slow_counts) 38 | px = slow_counts.astype(float)/ slow_counts.sum() 39 | slow_entropy = - np.sum(px * np.log(px)) 40 | assert np.abs(slow_entropy - entropy) < 1.e-8 41 | 42 | 43 | def slow_information_gain(labels0, labels1): 44 | H = 0. 45 | N = len(labels0) + len(labels1) 46 | nlabels = 1+max(labels0.max(), labels1.max()) 47 | counts = np.empty(nlabels, np.double) 48 | for arg in (labels0, labels1): 49 | H -= len(arg)/float(N) * set_entropy(arg, counts) 50 | return H 51 | 52 | def test_information_gain(): 53 | np.random.seed(22) 54 | for i in range(8): 55 | labels0 = (np.random.randn(20) > .2).astype(int) 56 | labels1 = (np.random.randn(33) > .8).astype(int) 57 | fast = information_gain(labels0, labels1) 58 | slow = slow_information_gain(labels0, labels1) 59 | assert np.abs(fast - slow) < 1.e-8 60 | 61 | 62 | def test_information_gain_small(): 63 | labels1 = np.array([0]) 64 | labels0 = np.array([0, 1]) 65 | assert information_gain(labels0, labels1) < 0. 
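# Note: as slow_information_gain makes explicit, information_gain returns
# only the negative weighted entropy of the two children (the constant
# parent-entropy term is omitted, presumably because it does not affect
# which split maximises the gain), so its values are always <= 0 -- hence
# the strict `< 0.` assertion above.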
66 | 67 | 68 | def test_z1_loss(): 69 | from milk.supervised.tree import z1_loss 70 | L0 = np.zeros(10) 71 | L1 = np.ones(10) 72 | L1[3] = 0 73 | W0 = np.ones(10) 74 | W1 = np.ones(10) 75 | assert z1_loss(L0, L1) == z1_loss(L0, L1, W0, W1) 76 | assert z1_loss(L0, L1) != z1_loss(L0, L1, W0, .8*W1) 77 | assert z1_loss(L0, L1) > 0 78 | 79 | 80 | def test_stump_learner(): 81 | learner = stump_learner() 82 | np.random.seed(111) 83 | for i in range(8): 84 | features = np.random.random_sample((40,2)) 85 | features[:20,0] += .5 86 | labels = np.repeat((0,1),20) 87 | model = learner.train(features, labels, normalisedlabels=True) 88 | assert not model.apply([0.01,.5]) 89 | assert model.apply(np.random.random_sample(2)+.8) 90 | assert model.idx == 0 91 | 92 | -------------------------------------------------------------------------------- /milk/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.utils.utils import get_nprandom, get_pyrandom 3 | 4 | def test_nprandom(): 5 | assert get_nprandom(None).rand() != get_nprandom(None).rand() 6 | assert get_nprandom(1).rand() != get_nprandom(2).rand() 7 | assert get_nprandom(1).rand() == get_nprandom(1).rand() 8 | r = get_nprandom(1) 9 | assert get_nprandom(r).rand() != r.rand() 10 | 11 | def test_pyrandom(): 12 | assert get_pyrandom(None).random() != get_pyrandom(None).random() 13 | assert get_pyrandom(1).random() != get_pyrandom(2).random() 14 | assert get_pyrandom(1).random() == get_pyrandom(1).random() 15 | r = get_pyrandom(1) 16 | assert get_pyrandom(r).random() != r.random() 17 | 18 | def test_cross_random(): 19 | assert get_pyrandom(get_nprandom(1)).random() == get_pyrandom(get_nprandom(1)).random() 20 | assert get_nprandom(get_pyrandom(1)).rand() == get_nprandom(get_pyrandom(1)).rand() 21 | 22 | def test_recursive(): 23 | def recurse(f): 24 | R = f(None) 25 | assert f(R) is R 26 | yield recurse, get_pyrandom 27 | yield recurse, get_nprandom 28 | 29 | -------------------------------------------------------------------------------- /milk/unsupervised/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | ''' 8 | milk.unsupervised 9 | 10 | Unsupervised Learning 11 | --------------------- 12 | 13 | - kmeans: This is a highly optimised implementation of kmeans 14 | - PCA: Simple implementation 15 | - Non-negative matrix factorisation: both direct and with sparsity constraints 16 | ''' 17 | 18 | from .kmeans import kmeans,repeated_kmeans, select_best_kmeans 19 | from .gaussianmixture import * 20 | from .pca import pca, mds, mds_dists 21 | from . 
import nnmf 22 | from .nnmf import * 23 | from .pdist import pdist, plike 24 | from .som import som 25 | from .normalise import zscore, center 26 | 27 | __all__ = [ 28 | 'center', 29 | 'kmeans', 30 | 'mds', 31 | 'mds_dists', 32 | 'pca', 33 | 'pdist', 34 | 'plike', 35 | 'repeated_kmeans', 36 | 'select_best_kmeans', 37 | 'som', 38 | 'zscore', 39 | ] + \ 40 | nnmf.__all__ 41 | -------------------------------------------------------------------------------- /milk/unsupervised/_som.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <limits> 3 | #include <cmath> 4 | #include "../utils/utils.h" 5 | 6 | extern "C" { 7 | #include <Python.h> 8 | #include <numpy/ndarrayobject.h> 9 | } 10 | 11 | namespace { 12 | struct SOM_Exception { 13 | SOM_Exception(const char* msg): msg(msg) { } 14 | const char* msg; 15 | 16 | }; 17 | void assert_type_contiguous(PyArrayObject* array,int type) { 18 | if (!PyArray_Check(array) || 19 | PyArray_TYPE(array) != type || 20 | !PyArray_ISCONTIGUOUS(array)) { 21 | throw SOM_Exception("Arguments to putpoints don't conform to expectation. Are you calling this directly? This is an internal function!"); 22 | } 23 | } 24 | 25 | void putpoints(PyArrayObject* grid, PyArrayObject* points, float L, int radius) { 26 | if (PyArray_NDIM(grid) != 3) throw SOM_Exception("grid should be three dimensional"); 27 | if (PyArray_NDIM(points) != 2) throw SOM_Exception("points should be two dimensional"); 28 | const int rows = PyArray_DIM(grid, 0); 29 | const int cols = PyArray_DIM(grid, 1); 30 | const int d = PyArray_DIM(grid, 2); 31 | const int n = PyArray_DIM(points, 0); 32 | if (PyArray_DIM(points, 1) != d) throw SOM_Exception("second dimension of points is not third dimension of grid"); 33 | 34 | Py_BEGIN_ALLOW_THREADS 35 | 36 | for (int i = 0; i != n; i++){ 37 | const float* p = static_cast<const float*>(PyArray_GETPTR1(points, i)); 38 | int min_y = 0; 39 | int min_x = 0; 40 | float best = std::numeric_limits<float>::max(); 41 | for (int y = 0; y != rows; ++y) { 42 | for (int x = 0; x != cols; ++x) { 43 | float dist = 0.; 44 | const float* gpoint = static_cast<const float*>(PyArray_GETPTR2(grid, y, x)); 45 | for (int j = 0; j != d; ++j) { 46 | dist += (p[j] - gpoint[j])*(p[j] - gpoint[j]); 47 | } 48 | if (dist < best) { 49 | best = dist; 50 | min_y = y; 51 | min_x = x; 52 | } 53 | } 54 | } 55 | const int start_y = std::max(0, min_y - radius); 56 | const int start_x = std::max(0, min_x - radius); 57 | const int end_y = std::min(rows, min_y + radius); 58 | const int end_x = std::min(cols, min_x + radius); 59 | 60 | for (int y = start_y; y != end_y; ++y) { 61 | for (int x = start_x; x != end_x; ++x) { 62 | const float L2 = L /(1 + std::abs(min_y - y) + std::abs(min_x - x)); 63 | float* gpoint = static_cast<float*>(PyArray_GETPTR2(grid, y, x)); 64 | for (int j = 0; j != d; ++j) { 65 | gpoint[j] *= (1.-L2); 66 | gpoint[j] += L2 * p[j]; 67 | } 68 | } 69 | } 70 | } 71 | Py_END_ALLOW_THREADS 72 | } 73 | 74 | 75 | PyObject* py_putpoints(PyObject* self, PyObject* args) { 76 | try { 77 | PyArrayObject* grid; 78 | PyArrayObject* points; 79 | float L; 80 | int radius; 81 | if (!PyArg_ParseTuple(args, "OOfi", &grid, &points, &L, &radius)) { 82 | const char* errmsg = "Arguments were not what was expected for putpoints.\n" 83 | "This is an internal function: Do not call directly unless you know exactly what you're doing.\n"; 84 | PyErr_SetString(PyExc_RuntimeError,errmsg); 85 | return 0; 86 | } 87 | assert_type_contiguous(grid, NPY_FLOAT); 88 | assert_type_contiguous(points, NPY_FLOAT); 89 | putpoints(grid, points, L, radius); 90 | 91 | 
Py_RETURN_NONE; 92 | } catch (const SOM_Exception& exc) { 93 | PyErr_SetString(PyExc_RuntimeError,exc.msg); 94 | return 0; 95 | } catch (...) { 96 | PyErr_SetString(PyExc_RuntimeError,"Some sort of exception in putpoints."); 97 | return 0; 98 | } 99 | } 100 | 101 | PyMethodDef methods[] = { 102 | {"putpoints", py_putpoints, METH_VARARGS, "Do NOT call directly.\n" }, 103 | {NULL, NULL,0,NULL}, 104 | }; 105 | 106 | const char * module_doc = 107 | "Internal SOM Module.\n" 108 | "\n" 109 | "Do NOT use directly!\n"; 110 | 111 | } // namespace 112 | 113 | 114 | DECLARE_MODULE(_som) 115 | 116 | -------------------------------------------------------------------------------- /milk/unsupervised/affinity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 3 | # Copyright (C) 2010-2011, 4 | # Luis Pedro Coelho <luis@luispedro.org>, 5 | # Alexandre Gramfort <alexandre.gramfort@inria.fr>, 6 | # Gael Varoquaux <gael.varoquaux@normalesup.org> 7 | # 8 | # License: MIT. See COPYING.MIT file in the milk distribution 9 | """Affinity propagation 10 | 11 | Original Authors (for scikits.learn): 12 | Alexandre Gramfort alexandre.gramfort@inria.fr 13 | Gael Varoquaux gael.varoquaux@normalesup.org 14 | 15 | Luis Pedro Coelho made the implementation more careful about allocating 16 | intermediate arrays. 17 | """ 18 | 19 | import numpy as np 20 | 21 | __all__ = [ 22 | 'affinity_propagation', 23 | ] 24 | 25 | def affinity_propagation(S, p=None, convit=30, maxit=200, damping=0.5, copy=True, R=0): 26 | """Perform Affinity Propagation Clustering of data 27 | 28 | Parameters 29 | ---------- 30 | S : array [n_points, n_points] 31 | Matrix of similarities between points 32 | p : array [n_points,] or float, optional 33 | Preferences for each point 34 | damping : float, optional 35 | Damping factor 36 | copy : boolean, optional 37 | If copy is False, the affinity matrix is modified inplace by the 38 | algorithm, for memory efficiency 39 | R : source of randomness 40 | 41 | Returns 42 | ------- 43 | 44 | cluster_centers_indices : array [n_clusters] 45 | index of clusters centers 46 | 47 | labels : array [n_points] 48 | cluster labels for each point 49 | 50 | Notes 51 | ----- 52 | See examples/plot_affinity_propagation.py for an example. 53 | 54 | Reference: 55 | Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages 56 | Between Data Points", Science Feb. 
2007 57 | 58 | """ 59 | if copy: 60 | # Copy the affinity matrix to avoid modifying it inplace 61 | S = S.copy() 62 | 63 | n_points = S.shape[0] 64 | 65 | assert S.shape[0] == S.shape[1] 66 | 67 | if p is None: 68 | p = np.median(S) 69 | 70 | if damping < 0.5 or damping >= 1: 71 | raise ValueError('damping must be >= 0.5 and < 1') 72 | 73 | random_state = np.random.RandomState(R) 74 | 75 | # Place preferences on the diagonal of S 76 | S.flat[::(n_points+1)] = p 77 | 78 | A = np.zeros((n_points, n_points)) 79 | R = np.zeros((n_points, n_points)) # Initialize messages 80 | 81 | # Remove degeneracies 82 | noise = random_state.randn(n_points, n_points) 83 | typeinfo = np.finfo(S.dtype) 84 | noise *= typeinfo.tiny*100 85 | S += noise 86 | del noise 87 | 88 | # Execute parallel affinity propagation updates 89 | e = np.zeros((n_points, convit)) 90 | 91 | ind = np.arange(n_points) 92 | 93 | for it in range(maxit): 94 | Aold = A.copy() 95 | Rold = R.copy() 96 | A += S 97 | 98 | I = np.argmax(A, axis=1) 99 | Y = A[ind, I] # np.max(A, axis=1) 100 | 101 | A[ind, I] = typeinfo.min 102 | 103 | Y2 = np.max(A, axis=1) 104 | R = S - Y[:, np.newaxis] 105 | 106 | R[ind, I[ind]] = S[ind, I] - Y2 107 | 108 | Rold *= damping 109 | R *= (1-damping) 110 | R += Rold 111 | 112 | # Compute availabilities 113 | Rd = R.diagonal().copy() 114 | np.maximum(R, 0, R) 115 | R.flat[::n_points+1] = Rd 116 | 117 | A = np.sum(R, axis=0)[np.newaxis, :] - R 118 | 119 | dA = np.diag(A) 120 | A = np.minimum(A, 0) 121 | 122 | A.flat[::n_points+1] = dA 123 | 124 | Aold *= damping 125 | A *= (1-damping) 126 | A += Aold 127 | 128 | # Check for convergence 129 | E = (np.diag(A) + np.diag(R)) > 0 130 | e[:, it % convit] = E 131 | K = np.sum(E, axis=0) 132 | 133 | if it >= convit: 134 | se = np.sum(e, axis=1) 135 | unconverged = np.sum((se == convit) + (se == 0)) != n_points 136 | if not unconverged and (K > 0): 137 | print("Converged after %d iterations." 
% it) 138 | break 139 | else: 140 | print("Did not converge") 141 | 142 | I = np.where(np.diag(A+R) > 0)[0] 143 | K = I.size # Identify exemplars 144 | 145 | if K > 0: 146 | c = np.argmax(S[:, I], axis=1) 147 | c[I] = np.arange(K) # Identify clusters 148 | # Refine the final set of exemplars and clusters and return results 149 | for k in range(K): 150 | ii = np.where(c==k)[0] 151 | j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0)) 152 | I[k] = ii[j] 153 | 154 | c = np.argmax(S[:, I], axis=1) 155 | c[I] = np.arange(K) 156 | labels = I[c] 157 | # Reduce labels to a sorted, gapless, list 158 | cluster_centers_indices = np.unique(labels) 159 | labels = np.searchsorted(cluster_centers_indices, labels) 160 | else: 161 | labels = np.empty((n_points, 1)) 162 | cluster_centers_indices = None 163 | labels.fill(np.nan) 164 | 165 | return cluster_centers_indices, labels 166 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | from . import lee_seung as ls 23 | from .hoyer import sparse_nnmf 24 | from . import hoyer 25 | lee_seung = ls.nnmf 26 | 27 | __all__ = ['lee_seung','sparse_nnmf'] 28 | 29 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 30 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/hoyer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | 24 | import numpy as np 25 | from ...utils import get_nprandom 26 | 27 | __all__ = ['sparse_nnmf'] 28 | 29 | 30 | def sp(s): 31 | L2 = np.sqrt(np.dot(s,s)) 32 | L1 = np.abs(s).sum() 33 | sn = np.sqrt(len(s)) 34 | return (sn-L1/L2)/(sn-1) 35 | 36 | def _solve_alpha(s,m,L2): 37 | sm = s-m 38 | s2 = np.dot(s,s) 39 | sm2 = np.dot(sm, sm) 40 | m2 = np.dot(m, m) 41 | dot = np.dot(m, sm) 42 | alpha = (-dot + np.sqrt(dot**2 - sm2*(m2-L2**2)))/sm2 43 | return alpha 44 | 45 | def _project(x,L1,L2): 46 | ''' 47 | Project x onto the set of non-negative vectors with the given L1 and L2 norms (Hoyer's projection operator) 48 | ''' 49 | x = np.asanyarray(x) 50 | n = len(x) 51 | 52 | s = x + (L1 - x.sum())/n 53 | Z = np.zeros(n,bool) 54 | while True: 55 | m = (~Z) * L1/(n-Z.sum()) 56 | alpha = _solve_alpha(s,m,L2) 57 | s = m + alpha * (s - m) 58 | negs = (s < 0) 59 | if not negs.any(): 60 | return s 61 | Z |= negs 62 | s[Z] = 0 63 | c = (s.sum() - L1)/(~Z).sum() 64 | s -= c*(~Z) 65 | 66 | def _L1for(s,x,L2): 67 | ''' 68 | Solve for L1 in 69 | 70 | s = [ sqrt(n) - L1/L2] / [sqrt(n) - 1] 71 | ''' 72 | sn = np.sqrt(len(x)) 73 | return L2*(s+sn-s*sn) 74 | 75 | def sparse_nnmf(V, r, sparsenessW=None, sparsenessH=None, max_iter=10000, R=None): 76 | ''' 77 | W,H = hoyer.sparse_nnmf(V, r, sparsenessW = None, sparsenessH = None, max_iter=10000, R=None) 78 | 79 | Implement sparse nonnegative matrix factorisation. 80 | 81 | Parameters 82 | ---------- 83 | V : 2-D matrix 84 | input feature matrix 85 | r : integer 86 | number of latent features 87 | sparsenessW : double, optional 88 | sparseness constraint on W (default: no sparsity constraint) 89 | sparsenessH : double, optional 90 | sparseness constraint on H (default: no sparsity constraint) 91 | max_iter : integer, optional 92 | maximum number of iterations (default: 10000) 93 | R : integer, optional 94 | source of randomness 95 | 96 | Returns 97 | ------- 98 | W : 2-ndarray 99 | H : 2-ndarray 100 | 101 | Reference 102 | --------- 103 | "Non-negative Matrix Factorisation with Sparseness Constraints" 104 | by Patrik Hoyer 105 | in Journal of Machine Learning Research 5 (2004) 1457--1469 106 | ''' 107 | 108 | n,m = V.shape 109 | R = get_nprandom(R) 110 | mu_W = .15 111 | mu_H = .15 112 | eps = 1e-8 113 | W = R.standard_normal((n,r))**2 114 | H = R.standard_normal((r,m))**2 115 | 116 | def fix(X, sparseness): 117 | for i in range(r): 118 | row = X[i] 119 | L2 = np.sqrt(np.dot(row, row)) 120 | row /= L2 121 | X[i] = _project(row, _L1for(sparseness, row, 1.), 1.) 
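            # X[i] now has unit L2 norm and the L1 norm that realises the
            # requested Hoyer sparseness level (cf. sp() above).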
122 | 123 | def fixW(): 124 | fix(W.T, sparsenessW) 125 | def fixH(): 126 | fix(H, sparsenessH) 127 | 128 | if sparsenessW is not None: fixW() 129 | if sparsenessH is not None: fixH() 130 | for i in range(max_iter): 131 | if sparsenessW is not None: 132 | W -= mu_W * np.dot(np.dot(W,H)-V,H.T) 133 | fixW() 134 | else: 135 | updateW = np.dot(V,H.T)/(np.dot(W,np.dot(H,H.T))+eps) 136 | W *= updateW 137 | if sparsenessH is not None: 138 | H -= mu_H * np.dot(W.T,np.dot(W,H)-V) 139 | fixH() 140 | else: 141 | updateH = np.dot(W.T,V)/(np.dot(np.dot(W.T,W),H)+eps) 142 | H *= updateH 143 | return W,H 144 | 145 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/lee_seung.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
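# Usage sketch (hypothetical data; nnmf is defined below):
#
#   import numpy as np
#   from milk.unsupervised.nnmf.lee_seung import nnmf
#   V = np.abs(np.random.standard_normal((50, 30)))  # non-negative input
#   W, H = nnmf(V, r=4, R=0)
#   print(np.linalg.norm(V - np.dot(W, H)))          # reconstruction error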
22 | 23 | from __future__ import division 24 | import numpy as np 25 | from numpy import dot 26 | from ...utils import get_nprandom 27 | 28 | __all__ = ['nnmf'] 29 | 30 | def nnmf(V, r, cost='norm2', max_iter=int(1e4), tol=1e-8, R=None): 31 | ''' 32 | A,S = nnmf(X, r, cost='norm2', max_iter=10000, tol=1e-8, R=None) 33 | 34 | Implement Lee & Seung's algorithm 35 | 36 | Parameters 37 | ---------- 38 | V : 2-ndarray 39 | input matrix 40 | r : integer 41 | nr of latent features 42 | cost : one of: 43 | 'norm2' : minimise || X - AS ||_2 (default) 44 | 'i-div' : minimise D(X||AS), where D is I-divergence (generalisation of K-L divergence) 45 | max_iter : integer, optional 46 | maximum number of iterations (default: 10000) 47 | tol : double 48 | tolerance threshold for early exit (when the update factor is within tol 49 | of 1., the function exits) 50 | R : integer, optional 51 | random seed 52 | 53 | Returns 54 | ------- 55 | A : 2-ndarray 56 | S : 2-ndarray 57 | 58 | Reference 59 | --------- 60 | "Algorithms for Non-negative Matrix Factorization" 61 | by Daniel D Lee, Sebastian H Seung 62 | (available at http://citeseer.ist.psu.edu/lee01algorithms.html) 63 | ''' 64 | # Nomenclature inside the function follows Lee & Seung (W,H); the docstring's A,S name the same matrices 65 | eps = 1e-8 66 | n,m = V.shape 67 | R = get_nprandom(R) 68 | W = R.standard_normal((n,r))**2 69 | H = R.standard_normal((r,m))**2 70 | for i in range(max_iter): 71 | if cost == 'norm2': 72 | updateH = dot(W.T,V)/(dot(dot(W.T,W),H)+eps) 73 | H *= updateH 74 | updateW = dot(V,H.T)/(dot(W,dot(H,H.T))+eps) 75 | W *= updateW 76 | elif cost == 'i-div': 77 | raise NotImplementedError('I-Div not implemented in lee_seung.nnmf') 78 | if (i % 10) == 0: 79 | max_update = max(updateW.max(),updateH.max()) 80 | if abs(1.-max_update) < tol: 81 | break 82 | return W,H 83 | 84 | -------------------------------------------------------------------------------- /milk/unsupervised/normalise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
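# Usage sketch (hypothetical data; zscore and center are defined below):
#
#   import numpy as np
#   from milk.unsupervised.normalise import zscore, center
#   X = np.random.random((10, 3))
#   Z = zscore(X)              # columns now have mean ~0 and std ~1
#   C, mu = center(X, axis=0)  # C is X minus its column means, mu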
22 | 23 | from __future__ import division 24 | import numpy as np 25 | __all__ = [ 26 | 'center', 27 | 'zscore', 28 | ] 29 | def _nanmean(arr, axis=None): 30 | nancounts = np.sum(~np.isnan(arr), axis=axis) 31 | return np.nansum(arr,axis=axis)/nancounts 32 | def _nanstd(arr, axis=None): 33 | if axis == 1: 34 | return _nanstd(arr.T, axis=0) 35 | mu = _nanmean(arr,axis=axis) 36 | return np.sqrt(_nanmean((arr-mu)**2, axis=axis)) 37 | 38 | 39 | def zscore(features, axis=0, can_have_nans=True, inplace=False): 40 | """ 41 | features = zscore(features, axis=0, can_have_nans=True, inplace=False) 42 | 43 | Returns features normalised to z-scores (a copy, unless ``inplace`` is True) 44 | 45 | Parameters 46 | ---------- 47 | features : ndarray 48 | 2-D input array 49 | axis : integer, optional 50 | which axis to normalise (default: 0) 51 | can_have_nans : boolean, optional 52 | whether ``features`` is allowed to have NaNs (default: True) 53 | inplace : boolean, optional 54 | Whether to operate inline (i.e., potentially change the input array). 55 | Default is False 56 | 57 | Returns 58 | ------- 59 | features : ndarray 60 | zscored version of features 61 | """ 62 | if not inplace: 63 | features = features.copy() 64 | else: 65 | features = np.asarray(features) 66 | if features.ndim != 2: 67 | raise ValueError('milk.unsupervised.zscore: Can only handle 2-D arrays') 68 | if can_have_nans: 69 | mu = _nanmean(features, axis) 70 | sigma = _nanstd(features, axis) 71 | else: 72 | mu = features.mean(axis) 73 | sigma = np.std(features, axis) 74 | sigma[sigma == 0] = 1. 75 | if axis == 0: 76 | features -= mu 77 | features /= sigma 78 | elif axis == 1: 79 | features -= mu[:,None] 80 | features /= sigma[:,None] 81 | return features 82 | 83 | 84 | 85 | def center(features, axis=0, can_have_nans=True, inplace=False): 86 | ''' 87 | centered, mean = center(features, axis=0, can_have_nans=True, inplace=False) 88 | 89 | Center data 90 | 91 | Parameters 92 | ---------- 93 | features : ndarray 94 | 2-D input array 95 | axis : integer, optional 96 | which axis to normalise (default: 0) 97 | can_have_nans : boolean, optional 98 | whether ``features`` is allowed to have NaNs (default: True) 99 | inplace : boolean, optional 100 | Whether to operate inline (i.e., potentially change the input array). 
101 | Default is False 102 | 103 | Returns 104 | ------- 105 | features : ndarray 106 | centered version of features 107 | mean : ndarray 108 | mean values 109 | ''' 110 | if can_have_nans: 111 | meanfunction = _nanmean 112 | else: 113 | meanfunction = np.mean 114 | features = np.array(features, copy=(not inplace), dtype=float) 115 | mean = meanfunction(features, axis=axis) 116 | if axis == 0: 117 | features -= mean 118 | elif axis == 1: 119 | features -= mean[:,None] 120 | else: 121 | raise ValueError('milk.unsupervised.center: axis ∉ {0, 1}') 122 | return features, mean 123 | 124 | -------------------------------------------------------------------------------- /milk/unsupervised/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | from __future__ import division 24 | import numpy as np 25 | 26 | def get_parzen_1class_rbf_loocv(features): 27 | ''' 28 | f,fprime = get_parzen_1class_rbf_loocv(features) 29 | 30 | Leave-one-out crossvalidation (LOOCV) value of a 1-class Parzen window 31 | estimator on features. 32 | 33 | Parameters 34 | ---------- 35 | features : ndarray 36 | feature matrix 37 | 38 | Returns 39 | ------- 40 | f : function: double -> double 41 | function which evaluates the LOOCV value at a given window bandwidth. 42 | Minimize to get the best bandwidth. 43 | fprime : function: double -> double 44 | function: df/dh 45 | ''' 46 | from milk.unsupervised.pdist import pdist 47 | D2 = -pdist(features) 48 | n = len(features) 49 | sumD2 = D2.sum() 50 | D2.flat[::(n+1)] = -np.inf 51 | def f(h): 52 | D2h = D2 / (2.*h) 53 | np.exp(D2h, D2h) 54 | val = D2h.sum() 55 | return val/np.sqrt(2*h*np.pi) 56 | def fprime(h): 57 | D2h = D2 / (2.*h) 58 | D2h.flat[::(n+1)] = 1. 
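        # Self-comparisons sit on the diagonal (set to -inf on D2 above); give
        # them a finite placeholder here and remove their contribution below
        # via the trace subtraction.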
59 | D2h *= np.exp(D2h) 60 | val = D2h.sum() - D2h.trace() 61 | val /= np.sqrt(2*h*np.pi) 62 | return -1./(4*np.pi*h)*f(h) + val 63 | return f,fprime 64 | 65 | def parzen(features, h): 66 | ''' 67 | f = parzen(features, h) 68 | 69 | Parzen window smoothing 70 | 71 | Parameters 72 | ---------- 73 | features : ndarray 74 | feature matrix 75 | h : double 76 | bandwidth 77 | 78 | Returns 79 | ------- 80 | f : callable (double^N -> double) 81 | density function 82 | ''' 83 | sum2 = np.array([np.dot(f,f) for f in features]) 84 | N = len(features) 85 | beta = 1./(N*np.sqrt(2*h*np.pi)) # normalisation, matching the LOOCV version above 86 | def f(x): 87 | dist = np.dot(features, -2*x) 88 | dist += sum2 89 | dist += np.dot(x,x) # completes ||f_i - x||^2 90 | dist /= -2.*h # Gaussian kernel: exp(-||f_i - x||^2/(2h)) 91 | np.exp(dist, dist) 92 | val = dist.sum() 93 | return val*beta 94 | return f 95 | 96 | -------------------------------------------------------------------------------- /milk/unsupervised/pca.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | from numpy import linalg 10 | from . import normalise 11 | from .pdist import pdist 12 | 13 | __all__ = [ 14 | 'pca', 15 | 'mds', 16 | 'mds_dists', 17 | ] 18 | 19 | def pca(X, zscore=True): 20 | ''' 21 | Y,V = pca(X, zscore=True) 22 | 23 | Principal Component Analysis 24 | 25 | Performs principal component analysis. Returns transformed 26 | matrix and principal components 27 | 28 | Parameters 29 | ---------- 30 | X : 2-dimensional ndarray 31 | data matrix 32 | zscore : boolean, optional 33 | whether to normalise to zscores (default: True) 34 | 35 | Returns 36 | ------- 37 | Y : ndarray 38 | Transformed matrix (of same dimension as X) 39 | V : ndarray 40 | principal components 41 | ''' 42 | if zscore: 43 | X = normalise.zscore(X) 44 | C = np.cov(X.T) 45 | w,v = linalg.eig(C) 46 | Y = np.dot(v,X.T).T 47 | return Y,v 48 | 49 | 50 | def mds(features, ndims, zscore=False): 51 | ''' 52 | X = mds(features, ndims, zscore=False) 53 | 54 | Euclidean Multi-dimensional Scaling 55 | 56 | Parameters 57 | ---------- 58 | features : ndarray 59 | data matrix 60 | ndims : int 61 | Number of dimensions to return 62 | zscore : boolean, optional 63 | Whether to zscore the features (default: False) 64 | 65 | Returns 66 | ------- 67 | X : ndarray 68 | array of size ``(m, ndims)`` where ``m = len(features)`` 69 | 70 | See Also 71 | -------- 72 | mds_dists : function 73 | ''' 74 | if zscore: 75 | features = normalise.zscore(features) 76 | else: 77 | features = np.asarray(features) 78 | P2 = pdist(features) 79 | return mds_dists(P2, ndims) 80 | 81 | def mds_dists(distances, ndims): 82 | ''' 83 | X = mds_dists(distances, ndims) 84 | 85 | Euclidean Multi-dimensional Scaling based on a distance matrix 86 | 87 | Parameters 88 | ---------- 89 | distances : ndarray 90 | data matrix 91 | ndims : int 92 | Number of dimensions to return 93 | 94 | Returns 95 | ------- 96 | X : ndarray 97 | array of size ``(m, ndims)`` where ``m = len(features)`` 98 | 99 | See Also 100 | -------- 101 | mds : function 102 | ''' 103 | 104 | n = len(distances) 105 | J = np.eye(n) - (1./n)* np.ones((n,n)) 106 | B = -.5 * np.dot(J,np.dot(distances,J)) 107 | w,v = np.linalg.eig(B) 108 | worder = w.argsort() 109 | worder = worder[::-1] 110 | w = w[worder] 111 | v = v[:,worder] 112 | 113 | 114 | w = w[:ndims] 115 | s = np.sign(w) 116 
| w = np.abs(w).real 117 | w = np.diag(np.sqrt(s * w)) 118 | X = np.dot(v[:,:ndims], w) 119 | return X.real 120 | 121 | -------------------------------------------------------------------------------- /milk/unsupervised/pdist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'pdist', 12 | 'plike', 13 | ] 14 | 15 | def pdist(X, Y=None, distance='euclidean2'): 16 | ''' 17 | D = pdist(X, Y={X}, distance='euclidean2') 18 | 19 | Compute distance matrix:: 20 | 21 | D[i,j] == np.sum( (X[i] - Y[j])**2 ) 22 | 23 | Parameters 24 | ---------- 25 | X : feature matrix 26 | Y : feature matrix (default: use `X`) 27 | distance : one of 'euclidean' or 'euclidean2' (default) 28 | 29 | Returns 30 | ------- 31 | D : matrix of doubles 32 | ''' 33 | # Use Dij = np.dot(Xi, Xi) + np.dot(Xj,Xj) - 2.*np.dot(Xi,Xj) 34 | if Y is None: 35 | D = np.dot(X, X.T) 36 | x2 = D.diagonal() 37 | x2 = x2.copy() 38 | y2 = x2 39 | else: 40 | D = np.dot(X, Y.T) 41 | x2 = np.array([np.dot(x,x) for x in X]) 42 | y2 = np.array([np.dot(y,y) for y in Y]) 43 | D *= -2. 44 | D += x2[:,np.newaxis] 45 | D += y2 46 | 47 | # Because of numerical imprecision, we might get negative numbers 48 | # (which cause problems down the road, e.g., when doing the sqrt): 49 | np.maximum(D, 0, D) 50 | if distance == 'euclidean': 51 | np.sqrt(D, D) 52 | return D 53 | 54 | 55 | def plike(X, sigma2=None): 56 | ''' 57 | L = plike(X, sigma2={guess based on X}) 58 | 59 | Compute likelihood that any two objects come from the same distribution 60 | under a Gaussian distribution hypothesis:: 61 | 62 | L[i,j] = exp( -||X[i] - X[j]||^2 / sigma2 ) 63 | 64 | Parameters 65 | ---------- 66 | X : ndarray 67 | feature matrix 68 | sigma2 : float, optional 69 | bandwidth 70 | 71 | Returns 72 | ------- 73 | L : ndarray 74 | likelihood matrix 75 | 76 | See Also 77 | -------- 78 | pdist : function 79 | Compute distances between objects 80 | ''' 81 | 82 | L = pdist(X) 83 | if sigma2 is None: 84 | sigma2 = np.median(L) 85 | L /= -sigma2 86 | np.exp(L, L) 87 | return L 88 | -------------------------------------------------------------------------------- /milk/unsupervised/som.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | from __future__ import division 6 | 7 | import numpy as np 8 | 9 | from ..utils import get_pyrandom 10 | from . 
import _som 11 | 12 | def putpoints(grid, points, L=.2, radius=4, iterations=1, shuffle=True, R=None): 13 | ''' 14 | putpoints(grid, points, L=.2, radius=4, iterations=1, shuffle=True, R=None) 15 | 16 | Feeds elements of `points` into the SOM `grid` 17 | 18 | Parameters 19 | ---------- 20 | grid : ndarray 21 | Self organising map 22 | points : ndarray 23 | data to feed to array 24 | L : float, optional 25 | How much to influence neighbouring points (default: .2) 26 | radius : integer, optional 27 | Maximum radius of influence (in L_1 distance, default: 4) 28 | iterations : integer, optional 29 | Number of iterations 30 | shuffle : boolean, optional 31 | Whether to shuffle the points before each iteration 32 | R : source of randomness 33 | ''' 34 | if radius is None: 35 | radius = 4 36 | if type(L) != float: 37 | raise TypeError("milk.unsupervised.som: L should be floating point") 38 | if type(radius) != int: 39 | raise TypeError("milk.unsupervised.som: radius should be an integer") 40 | if grid.dtype != np.float32: 41 | raise TypeError('milk.unsupervised.som: only float32 arrays are accepted') 42 | if points.dtype != np.float32: 43 | raise TypeError('milk.unsupervised.som: only float32 arrays are accepted') 44 | if len(grid.shape) == 2: 45 | grid = grid.reshape(grid.shape+(1,)) 46 | if shuffle: 47 | random = get_pyrandom(R) 48 | for i in range(iterations): 49 | if shuffle: 50 | random.shuffle(points) 51 | _som.putpoints(grid, points, L, radius) 52 | 53 | def closest(grid, f): 54 | ''' 55 | y,x = closest(grid, f) 56 | 57 | Finds the coordinates of the closest point in the `grid` to `f` 58 | 59 | :: 60 | 61 | y,x = \\argmin_{y,x} { || grid[y,x] - f ||^2 } 62 | 63 | Parameters 64 | ---------- 65 | grid : ndarray of shape Y,X,J 66 | self-organised map 67 | f : ndarray of shape J 68 | point 69 | 70 | Returns 71 | ------- 72 | y,x : integers 73 | coordinates into `grid` 74 | ''' 75 | delta = grid - f 76 | delta **= 2 77 | delta = delta.sum(2) 78 | return np.unravel_index(delta.argmin(), delta.shape) 79 | 80 | 81 | def som(data, shape, iterations=1000, L=.2, radius=4, R=None): 82 | ''' 83 | grid = som(data, shape, iterations=1000, L=.2, radius=4, R=None): 84 | 85 | Self-organising maps 86 | 87 | Parameters 88 | ---------- 89 | data : ndarray 90 | data to feed to array 91 | shape : tuple 92 | Desired shape of output. Must be 2-dimensional. 
93 | L : float, optional 94 | How much to influence neighbouring points (default: .2) 95 | radius : integer, optional 96 | Maximum radius of influence (in L_1 distance, default: 4) 97 | iterations : integer, optional 98 | Number of iterations 99 | R : source of randomness 100 | 101 | Returns 102 | ------- 103 | grid : ndarray 104 | Map 105 | ''' 106 | R = get_pyrandom(R) 107 | d = data.shape[1] 108 | if data.dtype != np.float32: 109 | data = data.astype(np.float32) 110 | grid = np.array(R.sample(list(data), np.prod(shape))).reshape(shape + (d,)) 111 | putpoints(grid, data, L=L, radius=radius, iterations=iterations, shuffle=True, R=R) 112 | return grid 113 | -------------------------------------------------------------------------------- /milk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | -------------------------------------------------------------------------------- /milk/utils/parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division, with_statement 7 | import multiprocessing 8 | 9 | max_procs = 1 10 | _used_procs = multiprocessing.Value('i', 1) 11 | _plock = multiprocessing.Lock() 12 | 13 | def set_max_processors(value=None): 14 | ''' 15 | set_max_processors(value=None) 16 | 17 | Set the maximum number of processors to ``value`` (or to the number of 18 | CPUs if ``None``). 19 | 20 | Note that this is valid for the current process and its children, but not 21 | the parent. 22 | 23 | Parameters 24 | ---------- 25 | value : int, optional 26 | Number of processors to use. Defaults to number of CPUs (as returned by 27 | ``multiprocessing.cpu_count()``). 28 | ''' 29 | global max_procs 30 | if value is None: 31 | value = multiprocessing.cpu_count() 32 | max_procs = value 33 | 34 | def get_proc(): 35 | ''' 36 | available = get_proc() 37 | 38 | Reserve a processor 39 | 40 | Returns 41 | ------- 42 | available : bool 43 | True if a processor is available 44 | ''' 45 | with _plock: 46 | if _used_procs.value >= max_procs: 47 | return False 48 | _used_procs.value += 1 49 | return True 50 | 51 | def release_proc(): 52 | ''' 53 | release_proc() 54 | 55 | Returns a processor to the pool 56 | ''' 57 | with _plock: 58 | _used_procs.value -= 1 59 | 60 | def release_procs(n, count_current=True): 61 | ''' 62 | release_procs(n, count_current=True) 63 | 64 | Returns ``n`` processors to the pool 65 | 66 | Parameters 67 | ---------- 68 | n : int 69 | Number of processors to release 70 | count_current : bool, optional 71 | Whether the current processor is to be included in ``n`` (default: True) 72 | ''' 73 | if count_current: 74 | n -= 1 75 | if n > 0: 76 | with _plock: 77 | _used_procs.value -= n 78 | 79 | def get_procs(desired=None, use_current=True): 80 | ''' 81 | n = get_procs(desired=None, use_current=True) 82 | 83 | Get up to ``desired`` processors (use None for no maximum). 84 | 85 | Parameters 86 | ---------- 87 | desired : int, optional 88 | Number of processors you wish. By default, there is no maximum 89 | use_current: bool, optional 90 | Whether to count the current processor, True by default. 
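    Returns
    -------
    n : int
        Number of processors actually reserved; includes the current one
        when ``use_current`` is True.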
91 | ''' 92 | if desired is None: 93 | desired = 1024 # This should last a few years 94 | n = (1 if use_current else 0) 95 | while n < desired: 96 | if get_proc(): 97 | n += 1 98 | else: 99 | return n 100 | return n 101 | 102 | -------------------------------------------------------------------------------- /milk/utils/utils.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | #include <Python.h> 3 | #include <numpy/ndarrayobject.h> 4 | } 5 | #if PY_MAJOR_VERSION < 3 6 | 7 | #define DECLARE_MODULE(name) \ 8 | extern "C" \ 9 | void init##name () { \ 10 | import_array(); \ 11 | (void)Py_InitModule(#name, methods); \ 12 | } 13 | 14 | #else 15 | 16 | #define DECLARE_MODULE(name) \ 17 | namespace { \ 18 | struct PyModuleDef moduledef = { \ 19 | PyModuleDef_HEAD_INIT, \ 20 | #name, \ 21 | NULL, \ 22 | -1, \ 23 | methods, \ 24 | NULL, \ 25 | NULL, \ 26 | NULL, \ 27 | NULL \ 28 | }; \ 29 | } \ 30 | PyMODINIT_FUNC \ 31 | PyInit_##name () { \ 32 | import_array(); \ 33 | return PyModule_Create(&moduledef); \ 34 | } 35 | 36 | #endif 37 | 38 | -------------------------------------------------------------------------------- /milk/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
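# Usage sketch (get_nprandom and get_pyrandom are defined below): both helpers
# normalise any of {None, int, random.Random, np.random.RandomState} into a
# concrete generator, so library code can accept a single `R` argument:
#
#   from milk.utils.utils import get_nprandom, get_pyrandom
#   r_np = get_nprandom(42)    # np.random.RandomState seeded with 42
#   r_py = get_pyrandom(r_np)  # random.Random seeded from r_np
#   x = r_np.standard_normal(3)
#   u = r_py.random()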
22 | 23 | 24 | import numpy as np 25 | import random 26 | 27 | __all__ = [ 28 | 'get_nprandom', 29 | 'get_pyrandom', 30 | ] 31 | 32 | def get_nprandom(R): 33 | ''' 34 | R' = get_nprandom(R) 35 | 36 | Returns a numpy.RandomState from R 37 | 38 | Parameters 39 | ---------- 40 | R : can be one of: 41 | None : Returns the default numpy global state 42 | integer : Uses it as a seed for constructing a new random generator 43 | RandomState : returns R 44 | 45 | Returns 46 | ------- 47 | R' : np.RandomState 48 | ''' 49 | if R is None: 50 | return np.random.mtrand._rand 51 | if type(R) == int: 52 | return np.random.RandomState(R) 53 | if type(R) is random.Random: 54 | return np.random.RandomState(R.randint(0, 2**30)) 55 | if type(R) is np.random.RandomState: 56 | return R 57 | raise TypeError("get_nprandom() does not know how to handle type {0}.".format(type(R))) 58 | 59 | def get_pyrandom(R): 60 | ''' 61 | R' = get_pyrandom(R) 62 | 63 | Returns a random.Random object based on R 64 | 65 | Parameters 66 | ---------- 67 | R : can be one of: 68 | None : Returns the default Python global state 69 | integer : Uses it as a seed for constructing a new random generator 70 | RandomState : returns a random.Random seeded from R 71 | 72 | Returns 73 | ------- 74 | R' : random.Random 75 | ''' 76 | if R is None: 77 | return random.seed.__self__ 78 | if type(R) is int: 79 | return random.Random(R) 80 | if type(R) is np.random.RandomState: 81 | return random.Random(R.randint(2**30)) 82 | if type(R) is random.Random: 83 | return R 84 | raise TypeError("get_pyrandom() does not know how to handle type {0}.".format(type(R))) 85 | 86 | 87 | -------------------------------------------------------------------------------- /milk/wrapper/__init__.py: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /milk/wrapper/wraplibsvm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | from milk.supervised.classifier import normaliselabels 8 | try: 9 | from libsvm import svm as libsvm 10 | except ImportError: 11 | try: 12 | import svm as libsvm 13 | except ImportError: 14 | libsvm = None 15 | from tempfile import NamedTemporaryFile 16 | 17 | class libsvmModel(object): 18 | def __init__(self, model, names, output_probability): 19 | self.model = model 20 | self.names = names 21 | self.output_probability = output_probability 22 | 23 | def apply(self,feats): 24 | if self.output_probability: 25 | return self.model.predict_probability(feats) 26 | res = self.model.predict(feats) 27 | return self.names[int(res)] 28 | 29 | def __getstate__(self): 30 | # This is really really really hacky, but it works 31 | N = NamedTemporaryFile() 32 | self.model.save(N.name) 33 | S = N.read() 34 | return S,self.output_probability,self.names 35 | 36 | def __setstate__(self,state): 37 | if libsvm is None: 38 | raise RuntimeError('LibSVM Library not found. 
Cannot use this classifier.') 39 | S,self.output_probability,self.names = state 40 | N = NamedTemporaryFile() 41 | N.write(S) 42 | N.flush() 43 | self.model = libsvm.svm_model(N.name) 44 | 45 | 46 | class libsvmClassifier(object): 47 | def __init__(self,probability = False, auto_weighting = True): 48 | if libsvm is None: 49 | raise RuntimeError('LibSVM Library not found. Cannot use this classifier.') 50 | self.param = libsvm.svm_parameter(kernel_type = libsvm.RBF, probability = probability) 51 | self.output_probability = probability 52 | self.auto_weighting = auto_weighting 53 | 54 | def set_option(self,optname,value): 55 | setattr(self.param, optname, value) 56 | 57 | def train(self, features, labels): 58 | labels,names = normaliselabels(labels) 59 | if self.auto_weighting: 60 | nlabels = labels.max() + 1 61 | self.param.nr_weight = int(nlabels) 62 | self.param.weight_label = list(range(nlabels)) 63 | self.param.weight = [(labels != i).mean() for i in range(nlabels)] 64 | problem = libsvm.svm_problem(labels.astype(float), features) 65 | model = libsvm.svm_model(problem, self.param) 66 | return libsvmModel(model, names, self.output_probability) 67 | 68 | -------------------------------------------------------------------------------- /readthedocs-requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | numpydoc 3 | matplotlib 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2014, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | from __future__ import division 24 | import os 25 | import platform 26 | 27 | try: 28 | import setuptools 29 | except: 30 | print(''' 31 | setuptools not found. 
32 | 33 | On linux, the package is often called python-setuptools''') 34 | from sys import exit 35 | exit(1) 36 | 37 | from numpy.distutils.core import setup, Extension 38 | exec(compile(open('milk/milk_version.py').read(), 39 | 'milk/milk_version.py', 'exec')) 40 | long_description = open('README.rst').read() 41 | undef_macros = [] 42 | define_macros = [] 43 | if os.environ.get('DEBUG'): 44 | undef_macros = ['NDEBUG'] 45 | if os.environ.get('DEBUG') == '2': 46 | define_macros = [ 47 | ('_GLIBCXX_DEBUG','1'), 48 | ('EIGEN_INTERNAL_DEBUGGING', '1'), 49 | ] 50 | 51 | _extensions = { 52 | 'milk.unsupervised._kmeans' : ['milk/unsupervised/_kmeans.cpp'], 53 | 'milk.unsupervised._som' : ['milk/unsupervised/_som.cpp'], 54 | 55 | 'milk.supervised._svm' : ['milk/supervised/_svm.cpp'], 56 | 'milk.supervised._tree' : ['milk/supervised/_tree.cpp'], 57 | 'milk.supervised._perceptron' : ['milk/supervised/_perceptron.cpp'], 58 | 'milk.supervised._lasso' : ['milk/supervised/_lasso.cpp'], 59 | } 60 | 61 | compiler_args = ['-std=c++0x'] 62 | if platform.system() == 'Darwin': 63 | compiler_args.append('-stdlib=libc++') 64 | 65 | ext_modules = [ 66 | Extension(key, 67 | sources=sources, 68 | undef_macros=undef_macros, 69 | define_macros=define_macros, 70 | extra_compile_args=compiler_args, 71 | ) 72 | for key,sources in _extensions.items() 73 | ] 74 | 75 | packages = [p for p in setuptools.find_packages() 76 | if p.startswith('milk')] 77 | 78 | package_dir = { 79 | 'milk.tests': 'milk/tests', 80 | } 81 | package_data = { 82 | 'milk.tests': ['data/*'], 83 | } 84 | 85 | setup(name = 'milk', 86 | version = __version__, 87 | description = 'Machine Learning Toolkit', 88 | long_description = long_description, 89 | author = u'Luis Pedro Coelho', 90 | author_email = 'luis@luispedro.org', 91 | url = 'http://luispedro.org/software/milk', 92 | license = 'MIT', 93 | packages = packages, 94 | package_dir = package_dir, 95 | package_data = package_data, 96 | ext_modules = ext_modules, 97 | test_suite = 'nose.collector', 98 | ) 99 | 100 | 101 | -------------------------------------------------------------------------------- /template.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | --------------------------------------------------------------------------------
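The unsupervised modules above compose into a small pipeline. A minimal sketch, assuming milk was built with its C++ extensions (e.g. _som) and using hypothetical random data:

import numpy as np
import milk.unsupervised as mu

X = np.random.random((100, 8)).astype(np.float32)
D = mu.pdist(X)                                # squared Euclidean distance matrix
Y, V = mu.pca(X.astype(float))                 # z-scored PCA projection and components
grid = mu.som(X, (8, 8), iterations=100, R=2)  # 8x8 self-organising map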