├── .gitignore ├── .travis.yml ├── COPYING.MIT ├── ChangeLog ├── INSTALL.rst ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile └── source │ ├── adaboost.rst │ ├── api.rst │ ├── benchmarks.rst │ ├── clustering.rst │ ├── conf.py │ ├── examples.rst │ ├── extensions.rst │ ├── featureselection.rst │ ├── index.rst │ ├── milksets.rst │ ├── nfoldcrossvalidation.rst │ ├── parallel.rst │ ├── principles.rst │ ├── randomforests.rst │ ├── randomnumbers.rst │ ├── readme.rst │ └── supervised.rst ├── get-eigen.sh ├── milk ├── __init__.py ├── active │ ├── __init__.py │ ├── eimpact.py │ └── uncertainty.py ├── demos │ ├── __init__.py │ ├── adaboost.py │ ├── rf_wine_2d.py │ └── svm-decision-boundary.py ├── ext │ ├── __init__.py │ └── jugparallel.py ├── measures │ ├── __init__.py │ ├── cluster_agreement.py │ ├── curves.py │ ├── measures.py │ └── nfoldcrossvalidation.py ├── milk_version.py ├── nfoldcrossvalidation.py ├── supervised │ ├── __init__.py │ ├── _lasso.cpp │ ├── _perceptron.cpp │ ├── _svm.cpp │ ├── _tree.cpp │ ├── adaboost.py │ ├── base.py │ ├── classifier.py │ ├── defaultclassifier.py │ ├── defaultlearner.py │ ├── featureselection.py │ ├── gridsearch.py │ ├── grouped.py │ ├── knn.py │ ├── lasso.py │ ├── logistic.py │ ├── multi.py │ ├── multi_label.py │ ├── multi_view.py │ ├── normalise.py │ ├── parzen.py │ ├── perceptron.py │ ├── precluster.py │ ├── precluster_learner.py │ ├── randomforest.py │ ├── set2binary_array.py │ ├── svm.py │ ├── tree.py │ └── weighted_voting_adaboost.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── jugparallel_jugfile.py │ │ ├── jugparallel_kmeans_jugfile.py │ │ └── regression-2-Dec-2009.pp.gz │ ├── fast_classifier.py │ ├── test_adaboost.py │ ├── test_affinity.py │ ├── test_basic.py │ ├── test_curves.py │ ├── test_defaultclassifier.py │ ├── test_defaultlearner.py │ ├── test_ecoc_learner.py │ ├── test_ext_jugparallel.py │ ├── test_featureselection.py │ ├── test_fisher.py │ ├── test_gaussianmixture.py │ ├── test_gridsearch.py │ ├── test_grouped.py │ ├── test_kmeans.py │ ├── test_knn.py │ ├── test_lasso.py │ ├── test_logistic.py │ ├── test_measures.py │ ├── test_measures_clusters.py │ ├── test_multi.py │ ├── test_multi_label.py │ ├── test_multi_view.py │ ├── test_nfoldcrossvalidation.py │ ├── test_nfoldcrossvalidation_regression.py │ ├── test_nnmf.py │ ├── test_normalise.py │ ├── test_normaliselabels.py │ ├── test_parzen.py │ ├── test_pca.py │ ├── test_pdist.py │ ├── test_perceptron.py │ ├── test_precluster_learner.py │ ├── test_regression.py │ ├── test_regression_constant_features.py │ ├── test_rf.py │ ├── test_set2binary_array.py │ ├── test_som.py │ ├── test_svm.py │ ├── test_svm_sigmoidal.py │ ├── test_tree.py │ └── test_utils.py ├── unsupervised │ ├── __init__.py │ ├── _kmeans.cpp │ ├── _som.cpp │ ├── affinity.py │ ├── gaussianmixture.py │ ├── kmeans.py │ ├── nnmf │ │ ├── __init__.py │ │ ├── hoyer.py │ │ └── lee_seung.py │ ├── normalise.py │ ├── parzen.py │ ├── pca.py │ ├── pdist.py │ └── som.py ├── utils │ ├── __init__.py │ ├── parallel.py │ ├── utils.h │ └── utils.py └── wrapper │ ├── __init__.py │ └── wraplibsvm.py ├── readthedocs-requirements.txt ├── setup.py └── template.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | milk/supervised/_svm.so 3 | milk/supervised/_tree.so 4 | milk/supervised/_perceptron.so 5 | milk/supervised/_lasso.so 6 | milk/unsupervised/_som.so 7 | milk/unsupervised/_kmeans.so 8 | build 9 | dist/ 10 | milk.egg-info 11 | docs/milk/ 12 | *.cpython*.so 13 
| -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | before_install: 7 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 8 | - chmod +x miniconda.sh 9 | - ./miniconda.sh -b 10 | - export PATH=/home/travis/miniconda/bin:$PATH 11 | - conda update --yes conda 12 | - sudo apt-get update -qq 13 | - sudo apt-get install -qq libatlas-dev liblapack-dev gfortran 14 | - sudo apt-get install -qq libeigen3-dev 15 | install: 16 | - conda create --yes -n condaenv python=$TRAVIS_PYTHON_VERSION numpy=1.9 17 | - conda install --yes -n condaenv scipy matplotlib pillow nose pip 18 | - conda install --yes -n condaenv -c https://conda.binstar.org/luispedro imread 19 | - source activate condaenv 20 | - pip install --quiet coveralls 21 | - pip install milksets 22 | - make debug 23 | script: nosetests 24 | -------------------------------------------------------------------------------- /COPYING.MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2011 Luis Pedro Coelho 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Building milk 3 | ============= 4 | 5 | To install dependencies in Ubuntu:: 6 | 7 | sudo apt-get install python-numpy python-scipy libeigen3-dev 8 | 9 | The following should work:: 10 | 11 | python setup.py install 12 | 13 | A C++ compiler is required. On Windows, you might need to specify the compiler. 14 | For example, if you have MinGW installed:: 15 | 16 | python setup.py install --compiler=mingw32 17 | 18 | --------------- 19 | Building on OSX 20 | --------------- 21 | 22 | Because the standard library used on OS X doesn't include the C++11 libraries by default, you will need to enable 23 | them in ``setup.py``: the final line of the compile arguments should look like:: 24 | 25 | extra_compile_args=['-std=c++0x', '-stdlib=libc++'],
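For reference, here is a minimal sketch of how such an ``Extension`` entry in ``setup.py`` might look with these flags. The module name, source list, and include directory below are illustrative only, not the actual values from milk's ``setup.py``::

    from setuptools import setup, Extension

    # Hypothetical extension entry; milk's real setup.py defines several C++ modules.
    ext = Extension('milk.supervised._svm',
                    sources=['milk/supervised/_svm.cpp'],
                    include_dirs=['milk/supervised/eigen3'],
                    extra_compile_args=['-std=c++0x', '-stdlib=libc++'])

    setup(name='milk', ext_modules=[ext])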
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include INSTALL.rst 3 | include COPYING.MIT 4 | include milk/tests/data/* 5 | include milk/utils/utils.h 6 | recursive-include milk/supervised/eigen3 * 7 | 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SOURCES = milk/*/*.cpp 2 | 3 | debug: $(SOURCES) 4 | DEBUG=2 python setup.py build --build-lib=. 5 | 6 | fast: $(SOURCES) 7 | python setup.py build --build-lib=. 8 | 9 | clean: 10 | rm -rf build milk/*/*.so 11 | 12 | tests: debug 13 | nosetests -vx 14 | 15 | docs: 16 | rm -rf build/docs 17 | cd docs && make html && cp -r build/html ../build/docs 18 | @echo python setup.py upload_docs 19 | 20 | .PHONY: clean docs tests fast debug 21 | 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | MILK: MACHINE LEARNING TOOLKIT 3 | ============================== 4 | Machine Learning in Python 5 | -------------------------- 6 | 7 | Milk is a machine learning toolkit in Python. 8 | 9 | Its focus is on supervised classification with several classifiers available: 10 | SVMs (based on libsvm), k-NN, random forests, decision trees. It also performs 11 | feature selection. These classifiers can be combined in many ways to form 12 | different classification systems. 13 | 14 | For unsupervised learning, milk supports k-means clustering and affinity 15 | propagation. 16 | 17 | Milk is flexible about its inputs. It is optimised for numpy arrays, but can often 18 | handle anything (for example, for SVMs, you can use any datatype and any kernel 19 | and it does the right thing). 20 | 21 | There is a strong emphasis on speed and low memory usage. Therefore, most of 22 | the performance sensitive code is in C++. This sits behind Python-based 23 | interfaces for convenience. 24 | 25 | To learn more, check the docs at `http://packages.python.org/milk/ 26 | <http://packages.python.org/milk/>`_ or the code demos included with the source 27 | at ``milk/demos/``.
28 | 29 | Examples 30 | -------- 31 | 32 | Here is how to test how well you can classify some ``features,labels`` data, 33 | measured by cross-validation:: 34 | 35 | import numpy as np 36 | import milk 37 | features = np.random.rand(100,10) # 2d array of features: 100 examples of 10 features each 38 | labels = np.zeros(100) 39 | features[50:] += .5 40 | labels[50:] = 1 41 | confusion_matrix, names = milk.nfoldcrossvalidation(features, labels) 42 | print('Accuracy:', confusion_matrix.trace()/float(confusion_matrix.sum())) 43 | 44 | If you want to use a classifier, you instantiate a *learner object* and call its 45 | ``train()`` method:: 46 | 47 | import numpy as np 48 | import milk 49 | features = np.random.rand(100,10) 50 | labels = np.zeros(100) 51 | features[50:] += .5 52 | labels[50:] = 1 53 | learner = milk.defaultclassifier() 54 | model = learner.train(features, labels) 55 | 56 | # Now you can use the model on new examples: 57 | example = np.random.rand(10) 58 | print(model.apply(example)) 59 | example2 = np.random.rand(10) 60 | example2 += .5 61 | print(model.apply(example2)) 62 | 63 | There are several classification methods in the package, but they all use the 64 | same interface: ``train()`` returns a *model* object, which has an ``apply()`` 65 | method to execute on new instances. 66 | 67 | 68 | Details 69 | ------- 70 | License: MIT 71 | 72 | Author: Luis Pedro Coelho (with code from LibSVM and scikits.learn) 73 | 74 | API Documentation: `http://packages.python.org/milk/ <http://packages.python.org/milk/>`_ 75 | 76 | Mailing List: `http://groups.google.com/group/milk-users 77 | <http://groups.google.com/group/milk-users>`__ 78 | 79 | Features 80 | -------- 81 | - SVMs. Using the libsvm solver with a pythonesque wrapper around it. 82 | - LASSO 83 | - K-means using as little memory as possible. It can cluster millions of 84 | instances efficiently. 85 | - Random forests 86 | - Self organising maps 87 | - Stepwise Discriminant Analysis for feature selection. 88 | - Non-negative matrix factorisation 89 | - Affinity propagation 90 | 91 | Recent History 92 | -------------- 93 | 94 | The ChangeLog file contains a more complete history. 95 | 96 | New in 0.6.1 (11 May 2015) 97 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 98 | - Fixed source distribution 99 | 100 | New in 0.6 (27 Apr 2015) 101 | ~~~~~~~~~~~~~~~~~~~~~~~~ 102 | - Update for Python 3 103 | 104 | New in 0.5.3 (19 Jun 2013) 105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 106 | - Fix MDS for non-array inputs 107 | - Fix MDS bug 108 | - Add return_* arguments to kmeans 109 | - Extend zscore() to work on non-ndarrays 110 | - Add frac_precluster_learner 111 | - Work with older C++ compilers 112 | 113 | 114 | New in 0.5.2 (7 Mar 2013) 115 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 116 | - Fix distribution of Eigen with source 117 | 118 | New in 0.5.1 (11 Jan 2013) 119 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 120 | - Add subspace projection kNN 121 | - Export ``pdist`` in milk namespace 122 | - Add Eigen to source distribution 123 | - Add measures.curves.roc 124 | - Add ``mds_dists`` function 125 | - Add ``verbose`` argument to milk.tests.run 126 | 127 | 128 | New in 0.5 (05 Nov 2012) 129 | ~~~~~~~~~~~~~~~~~~~~~~~~ 130 | - Add coordinate-descent based LASSO 131 | - Add unsupervised.center function 132 | - Make zscore work with NaNs (by ignoring them) 133 | - Propagate apply_many calls through transformers 134 | - Much faster SVM classification, which means a much faster defaultlearner() 135 | [measured 2.5x speedup on yeast dataset!]
136 | 137 | 138 | For older versions, see the ``ChangeLog`` file -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | 9 | # Internal variables. 10 | PAPEROPT_a4 = -D latex_paper_size=a4 11 | PAPEROPT_letter = -D latex_paper_size=letter 12 | ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 13 | 14 | .PHONY: help clean html web pickle htmlhelp latex changes linkcheck 15 | 16 | help: 17 | @echo "Please use \`make <target>' where <target> is one of" 18 | @echo " html to make standalone HTML files" 19 | @echo " pickle to make pickle files" 20 | @echo " json to make JSON files" 21 | @echo " htmlhelp to make HTML files and a HTML help project" 22 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 23 | @echo " changes to make an overview over all changed/added/deprecated items" 24 | @echo " linkcheck to check all external links for integrity" 25 | 26 | clean: 27 | -rm -rf build/* 28 | 29 | html: 30 | mkdir -p build/html build/doctrees 31 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html 32 | @echo 33 | @echo "Build finished. The HTML pages are in build/html." 34 | 35 | pickle: 36 | mkdir -p build/pickle build/doctrees 37 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle 38 | @echo 39 | @echo "Build finished; now you can process the pickle files." 40 | 41 | web: pickle 42 | 43 | json: 44 | mkdir -p build/json build/doctrees 45 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) build/json 46 | @echo 47 | @echo "Build finished; now you can process the JSON files." 48 | 49 | htmlhelp: 50 | mkdir -p build/htmlhelp build/doctrees 51 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp 52 | @echo 53 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 54 | ".hhp project file in build/htmlhelp." 55 | 56 | latex: 57 | mkdir -p build/latex build/doctrees 58 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex 59 | @echo 60 | @echo "Build finished; the LaTeX files are in build/latex." 61 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 62 | "run these through (pdf)latex." 63 | 64 | changes: 65 | mkdir -p build/changes build/doctrees 66 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes 67 | @echo 68 | @echo "The overview file is in build/changes." 69 | 70 | linkcheck: 71 | mkdir -p build/linkcheck build/doctrees 72 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck 73 | @echo 74 | @echo "Link check complete; look for any errors in the above output " \ 75 | "or in build/linkcheck/output.txt." 76 | -------------------------------------------------------------------------------- /docs/source/adaboost.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | AdaBoost 3 | ======== 4 | 5 | Adaboost 6 | -------- 7 | 8 | This example is available as part of milk as ``milk/demos/adaboost.py``. 9 | 10 | Adaboost is based on a weak learner. For this example, we are going to use a 11 | stump learner:: 12 | 13 | import milk.supervised.tree 14 | import milk.supervised.adaboost 15 | 16 | weak = milk.supervised.tree.stump_learner() 17 | learner = milk.supervised.adaboost.boost_learner(weak) 18 | 19 | Currently, only binary classification is implemented for ``boost_learner``.
20 | Therefore, we need to use a converter, in this case, using the *one versus one* 21 | strategy:: 22 | 23 | import milk.supervised.multi 24 | learner = milk.supervised.multi.one_against_one(learner) 25 | 26 | Now, we can use this learner as we would normally do. For example, for 27 | cross-validation:: 28 | 29 | from milksets import wine 30 | features, labels = wine.load() 31 | cmat,names,predictions = \ 32 | milk.nfoldcrossvalidation(features, \ 33 | labels, \ 34 | classifier=learner, \ 35 | return_predictions=True) 36 | 37 | We display just the first two dimensions here using circles for correct 38 | predictions and crosses for mis-matches. The colour represents the underlying 39 | class:: 40 | 41 | import pylab as plt 42 | colors = "rgb" 43 | codes = "xo" 44 | for y,x,r,p in zip(features.T[0], features.T[1], labels, predictions): 45 | code = codes[int(r == p)] 46 | plt.plot([y],[x], colors[p]+code) 47 | plt.show() 48 | 49 | .. plot:: ../../milk/demos/adaboost.py 50 | :include-source: 51 | 52 | API Documentation 53 | ----------------- 54 | 55 | .. automodule:: milk.supervised.adaboost 56 | :members: boost_learner 57 | 58 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API Documentation 3 | ================= 4 | 5 | .. automodule:: milk 6 | :members: 7 | 8 | .. automodule:: milk.supervised 9 | :members: 10 | 11 | .. automodule:: milk.unsupervised 12 | :members: 13 | 14 | .. automodule:: milk.measures 15 | :members: 16 | 17 | .. automodule:: milk.utils 18 | :members: 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Benchmarks 3 | ========== 4 | 5 | Scikits.learn benchmark 6 | ----------------------- 7 | 8 | This is from a benchmark developed by the `scikits.learn team 9 | `__. I ran it on my Intel Core2 10 | 6600, 2.40GHz CPU. 11 | 12 | .. table:: Results in scikits.learn ml-benchmarks 13 | 14 | ============ ======= ====== ======= ======== ============= ======== 15 | Benchmark PyMVPA Shogun Pybrain MLPy scikits.learn milk 16 | ============ ======= ====== ======= ======== ============= ======== 17 | knn **1.0** 2.23 -- 2.23 3.05 2.20 18 | elasticnet -- -- -- 174.43 **1.0** -- 19 | lassolars -- -- -- 61.67 **1.0** -- 20 | pca -- -- -- -- **1.0** 11.11 21 | kmeans -- 2.02 7057.02 1.61 6.74 **1.0** 22 | svm 3.35 1.20 -- -- 1.24 **1.0** 23 | ============ ======= ====== ======= ======== ============= ======== 24 | 25 | 26 | All of the results are normalised by the fastest system for each entry (which 27 | is therefore, by definition, 1.0). 28 | 29 | So, except for PCA, milk *is pretty fast* and for kmeans and SVM learning it is 30 | the fastest system. 31 | 32 | Limitations of This Benchmark 33 | ----------------------------- 34 | 35 | 1. It is a very small dataset, so you do not get a feeling of how it scales. 36 | 2. It is only one dataset. 37 | 3. Since the benchmark came out, I made some changes to milk to make it go 38 | faster. I hope that other systems do the same, though, so we can have good progress.
40 | 41 | -------------------------------------------------------------------------------- /docs/source/clustering.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Clustering 3 | ========== 4 | 5 | K-Means 6 | ------- 7 | 8 | K-means is one of the simplest, but often most effective, clustering 9 | algorithms. milk supports k-means through the ``milk.kmeans`` function: 10 | 11 | :: 12 | 13 | features = np.random.randn(100,20) 14 | features[:50] *= 2 15 | 16 | k = 2 17 | cluster_ids, centroids = milk.kmeans(features, k) 18 | 19 | The milk implementation is very fast and can handle large amounts of data. In 20 | an effort to make it scale to millions of data points, the author of milk even 21 | included new features in numpy. If you happen to run numpy 1.6 or newer, then 22 | milk will pick it up and run faster with less memory. 23 | 24 | Milk has been used to cluster datasets with over 5 million data points and over 25 | 100 features per data point. You need enough RAM to handle the data matrix and 26 | the distance matrix (NxK) and a little extra, but milk is very careful not to 27 | allocate any more memory than it needs. 28 | 29 | 30 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | randomforests 9 | 10 | -------------------------------------------------------------------------------- /docs/source/extensions.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Extensions 3 | ========== 4 | 5 | An extension adds some functionality that is either not really core or requires 6 | extra dependencies. Currently, the only extension is a little `jug 7 | `__ based function for *parallel 8 | nfoldcrossvalidation*. 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/featureselection.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Feature Normalisation and Selection 3 | =================================== 4 | 5 | For many problems, feature normalisation and selection is an important step in 6 | the processing pipeline. 7 | 8 | Simple Normalisations 9 | --------------------- 10 | 11 | Fill in ``NaNs`` and ``Infs``: the ``checkfinite()`` learner does this. This 12 | learner does not use any of its input features: it always returns the same 13 | model. 14 | 15 | Whiten 16 | ------ 17 | 18 | Check out the function ``zscore()`` if you have a feature matrix, or the 19 | ``zscore_normalise()`` learner. 20 | 21 | Stepwise Discriminant Analysis 22 | ------------------------------ 23 | 24 | Stepwise Discriminant Analysis (SDA) is a simple feature selection method. It 25 | is supervised and independent of the downstream classifier. 26 | 27 | **Important Note**: SDA does not work well if your features are linearly 28 | dependent. Filter out linearly dependent features before calling SDA (use 29 | ``linearly_dependent_features``). 30 | 31 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: readme.rst 2 | 3 | Contents: 4 | 5 | ..
toctree:: 6 | :maxdepth: 2 7 | 8 | supervised 9 | nfoldcrossvalidation 10 | featureselection 11 | adaboost 12 | clustering 13 | randomnumbers 14 | examples 15 | parallel 16 | extensions 17 | benchmarks 18 | milksets 19 | api 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`search` 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/source/milksets.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Milksets 3 | ======== 4 | 5 | Milksets is a separate package that contains a few UCI datasets in a format 6 | that is easy to handle with milk. 7 | 8 | It is mostly useful for testing and playing around. 9 | 10 | 11 | You can install it from pypi with:: 12 | 13 | pip install milksets 14 | 15 | or:: 16 | 17 | easy_install milksets 18 | 19 | Links 20 | ----- 21 | 22 | - `github `__ 23 | - `homepage `__ 24 | 25 | -------------------------------------------------------------------------------- /docs/source/nfoldcrossvalidation.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Cross-validation 3 | ================ 4 | 5 | Cross validation is one of the better ways to evaluate the performance of 6 | supervised classification. 7 | 8 | Cross validation consists of separating the data into folds (hence the name 9 | _n_-fold cross-validation, where _n_ is a positive integer). For the purpose of 10 | this discussion, we consider 10 folds. In the first round, we leave the first 11 | fold out. This means we train on the other 9 folds and then evaluate the model 12 | on this left-out fold. On the second round, we leave the second fold out. This 13 | continues until every fold has been left out exactly once. 14 | 15 | Milk supports what is often explicitly called *stratified cross validation*, 16 | which means that it takes the class distributions into account (so that, in 10 17 | fold cross validation, each fold will have 10% of each class per round). 18 | 19 | An additional functionality, not normally found in machine learning packages or 20 | in machine learning theory, but very useful in practice, is the use of the 21 | ``origins`` parameter. Every datapoint can have an associated *origin*. This is 22 | an integer and its meaning is the following: all examples with the same 23 | origin will be in the same fold (so testing will never be performed where there 24 | was an object of the same origin used for training). 25 | 26 | This can model cases such as the following: you have collected patient data, 27 | which includes both some health measurement and an outcome of interest (for 28 | example, how the patient was doing a year after the initial exam). You wish to 29 | evaluate a supervised classification algorithm for predicting outcomes. In 30 | particular, you wish for an estimate of how well the system would perform on 31 | patients in any location (you know that the data collection has some site 32 | effects, perhaps because each person runs the test a little bit differently). 33 | Fortunately, you have the data to test this: the patients come from several 34 | clinics. Now, you set each patient's origin to be the ID of the clinic and 35 | evaluate the per-patient accuracy. 36 | 37 | 38 | API Documentation 39 | ----------------- 40 | 41 | ..
automodule:: milk.measures.nfoldcrossvalidation 42 | :members: 43 | 44 | -------------------------------------------------------------------------------- /docs/source/parallel.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Parallel Processing 3 | =================== 4 | 5 | .. versionadded:: 0.3.10 6 | Jug integration was added in version 0.3.10. Parallel processing was added 7 | with 0.4.0 8 | 9 | There is certain functionality in milk which is *embarrassingly parallel* (or 10 | almost so). Therefore, milk has some support for using multiprocessors and 11 | computing clusters. 12 | 13 | Jug Integration 14 | --------------- 15 | 16 | .. versionadded:: 0.3.10 17 | Jug integration requires `jug `__ 18 | 19 | Currently, there is support for running n-fold crossvalidation as multiple jug 20 | tasks, which jug can then partition across multiple processors (or computers in 21 | a cluster). 22 | 23 | Example 24 | ~~~~~~~ 25 | 26 | :: 27 | from milk.ext.jugparallel import nfoldcrossvalidation 28 | 29 | # For this example, we rely on milksets 30 | from milksets.wine import load 31 | 32 | # Load the data 33 | features, labels = load() 34 | 35 | cmatrix = nfoldcrossvalidation(features, labels) 36 | 37 | 38 | Save this as ``example.py`` and, now, you can run ``jug execute example.py`` to 39 | perform 10-fold cross-validation. Each fold will be its own Task and can be run 40 | independently of the others. 41 | 42 | Multiprocessing 43 | --------------- 44 | 45 | .. versionadded:: 0.4 46 | 47 | There are some opportunities for parallel processing which are hard to fit into 48 | the Jug framework (which is limited to coarse grained parallelisation). For 49 | example, choosing the parameters of a learner (e.g., the SVM learner) through 50 | cross-validation has a high degree of parallelisation, but is hard to fit into 51 | the jug framework without (1) restructuring the code and (2) doing unnecessary 52 | computation. 53 | 54 | Therefore, milk can use multiple processes for this operation, using the Python 55 | ``multiprocessing`` module. 56 | 57 | Currently, by default, *this functionality is disabled.* Change the value of 58 | ``milk.utils.parallel.max_procs`` to enable it. 59 | 60 | Over time, more functionality will take advantage of multiple cores. 61 | 62 | Example 63 | ~~~~~~~ 64 | 65 | This is a simple example, which relies on `milksets 66 | `__ just for convenience (you could use 67 | any other labeled feature set). 68 | 69 | As you can see, you do not have to do anything except call 70 | ``milk.utils.parallel.set_max_processors()`` to enable multiprocessing (calling it 71 | without an argument sets the number of processes to the number of CPUs).
72 | 73 | :: 74 | 75 | import numpy as np 76 | import milk 77 | 78 | # Import the parallel module 79 | from milk.utils import parallel 80 | 81 | # For this example, we rely on milksets 82 | from milksets.wine import load 83 | 84 | # Use all available processors 85 | parallel.set_max_processors() 86 | 87 | # Load the data 88 | features, labels = load() 89 | learner = milk.defaultlearner() 90 | model = learner.train(features[::2], labels[::2]) 91 | held_out = list(map(model.apply, features[1::2])) 92 | print(np.mean(labels[1::2] == held_out)) 93 | 94 | 95 | Naturally, you can combine both of these features:: 96 | 97 | from milk.ext.jugparallel import nfoldcrossvalidation 98 | # Import the parallel module 99 | from milk.utils import parallel 100 | 101 | # For this example, we rely on milksets 102 | from milksets.wine import load 103 | 104 | # Use all available processors 105 | parallel.set_max_processors() 106 | 107 | # Load the data 108 | features, labels = load() 109 | 110 | cmatrix = nfoldcrossvalidation(features, labels) 111 | 112 | This is now a jug script which uses all available processors. This is ideal if 113 | you have a cluster of machines with multiple cores per machine. You can run 114 | different folds on different machines and, internally, each fold will use all 115 | the cores on its machine. 116 | 117 | Naturally, if you run multiple folds on the same machine, they will end up 118 | fighting for the same cores and you will get no speedup. 119 | 120 | -------------------------------------------------------------------------------- /docs/source/principles.rst: -------------------------------------------------------------------------------- 1 | Principles of Milk 2 | ------------------- 3 | 4 | Play Well With Others 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | This is the basic principle of milk: it should play well with others. It means 8 | that its interfaces should, as much as possible, be flexible. 9 | 10 | Be Liberal With What You Accept. Be Conservative With What You Produce 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | 13 | Don't be fussy about input parameters, but specify your outputs very carefully. 14 | 15 | Work Interactively 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | This means that building a classifier should look like this:: 19 | 20 | classifier = milk.default_classifier(data,labels) 21 | 22 | and not like this:: 23 | 24 | classifier = milk.concattransforms( 25 | milk.chkfinite(), 26 | milk.to_interval(1,-1), 27 | milk.pick_best(f=0.10), 28 | milk.binary_to_multi(mode='1-vs-1', 29 | base=milk.supervised.gridsearch( 30 | base=milk.svm_binary(base=milk.svm_libsvm()), 31 | params={ 32 | 'C' : [2**c for c in xrange(-7,4)], 33 | 'kernel' : [milk.rbf_kernel(2**w) for w in xrange(-4,2)]))) 34 | container = milk.container() 35 | for col in len(data[0]): 36 | container.set_column(col,milk.CONTINUOUS) 37 | container.set_data(data) 38 | labelcontainer = milk.labelcontainer() 39 | labelcontainer.set_type(milk.STRING) 40 | labelcontainer.set_data(labels) 41 | 42 | classifier.train(container,labelcontainer) 43 | 44 | This often means that one might have a more complete interface internally and 45 | another interface for interactive use on top (see Matplotlib_ for a good 46 | example of this). 47 | 48 | .. _Matplotlib: http://matplotlib.sourceforge.net/ 49 | 50 | 51 | Don't Impose Yourself 52 | ~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | Don't assume that people are writing their software around your library, which 55 | translates into: 56 | 57 | * Don't impose your file format.
58 | * Don't impose your in-memory data format. 59 | 60 | Be Pythonic 61 | ~~~~~~~~~~~ 62 | 63 | In general, be a true Python library (and not just a wrapper around something 64 | else). For example: 65 | 66 | - If an SVM classifier takes a kernel as a parameter, then it should accept any 67 | 2-argument Python function (in fact, anything that's callable in Python). 68 | - Objects (like classifiers) should be pickle-able. 69 | 70 | You Don't Pay For What You Don't Use 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 72 | 73 | Flexibility should come with the lowest-possible cost. If a cost is 74 | unavoidable, it should be paid by those who use the flexibility and not by 75 | everybody else. 76 | 77 | -------------------------------------------------------------------------------- /docs/source/randomforests.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Using Random Forests 3 | ==================== 4 | 5 | If you are not familiar with random forests, in general, `Wikipedia 6 | `__ is a good place to start 7 | reading. The current article deals only with how to use them in **milk**. 8 | 9 | Random forests as implemented in milk are *binary classifiers*, so you need to 10 | use a transformer to turn them into multi-class learners if you have 11 | multi-class data. 12 | 13 | :: 14 | 15 | from milk.supervised import randomforest 16 | from milk.supervised.multi import one_against_one 17 | 18 | rf_learner = randomforest.rf_learner() 19 | learner = one_against_one(rf_learner) 20 | 21 | This is just another learner type, which we can use to train a model:: 22 | 23 | from milksets import wine 24 | features, labels = wine.load() 25 | model = learner.train(features, labels) 26 | 27 | or to perform cross-validation:: 28 | 29 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 30 | 31 | If you have `milksets `__ installed, you can try it on one of its datasets:: 32 | 33 | from milksets import wine 34 | features, labels = wine.load() 35 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 36 | 37 | We can finally plot the results (mapped to 2 dimensions using PCA): 38 | 39 | .. plot:: ../../milk/demos/rf_wine_2d.py 40 | :include-source: 41 | 42 | Colours indicate the classification output. A circle means that it matches the 43 | underlying label, a cross that it was a mis-classification. 44 | 45 | -------------------------------------------------------------------------------- /docs/source/randomnumbers.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Random Numbers 3 | ============== 4 | How milk handles random numbers 5 | ------------------------------- 6 | 7 | Many algorithms (e.g., `kmeans`) require random number initialisation. 8 | 9 | In `milk`, all functions that internally use random numbers take an `R` 10 | parameter. If left unspecified (or set to `None`), then it means that the 11 | internal initialisation should be used. 12 | 13 | `R` can be specified by an integer, a `random.Random` instance, or a 14 | `numpy.RandomState` instance. If the same `R` is passed twice to the function, 15 | then the results are deterministic. 16 | 17 | Functions that use random numbers 18 | --------------------------------- 19 | 20 | - `kmeans`: for initial cluster choice. 21 | - `repeated_kmeans`: for use in `kmeans` internally. 22 | - `som`: for initial choice of points. 
23 | - `nnmf` and `sparse_nnmf`: for initialisation. 24 | 25 | ``random`` and ``numpy.random`` 26 | ------------------------------- 27 | 28 | There are two randomness mechanisms used internally by `milk`: `random` (the 29 | standard Python package) and `numpy.random`. Setting the seed on just one of 30 | them will not be enough. You need to set *both*. This is an alternative to 31 | using the `R` technique outlined above. 32 | 33 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | MILK: MACHINE LEARNING TOOLKIT 3 | ============================== 4 | Machine Learning in Python 5 | -------------------------- 6 | 7 | Milk is a machine learning toolkit in Python. 8 | 9 | Its focus is on supervised classification with several classifiers available: 10 | SVMs (based on libsvm), k-NN, random forests, decision trees. It also performs 11 | feature selection. These classifiers can be combined in many ways to form 12 | different classification systems. 13 | 14 | For unsupervised learning, milk supports k-means clustering and affinity 15 | propagation. 16 | 17 | Milk is flexible about its inputs. It is optimised for numpy arrays, but can often 18 | handle anything (for example, for SVMs, you can use any datatype and any kernel 19 | and it does the right thing). 20 | 21 | There is a strong emphasis on speed and low memory usage. Therefore, most of 22 | the performance sensitive code is in C++. This sits behind Python-based 23 | interfaces for convenience. 24 | 25 | Features 26 | -------- 27 | - Random forests 28 | - Self organising maps 29 | - SVMs. Using the libsvm solver with a pythonesque wrapper around it. 30 | - Stepwise Discriminant Analysis for feature selection. 31 | - Non-negative matrix factorisation 32 | - K-means using as little memory as possible. 33 | - Affinity propagation 34 | 35 | License: MIT 36 | Author: Luis Pedro Coelho (with code from LibSVM and scikits.learn) 37 | Website: `http://luispedro.org/software/milk 38 | <http://luispedro.org/software/milk>`__ 39 | API Documentation: `http://packages.python.org/milk/ <http://packages.python.org/milk/>`_ -------------------------------------------------------------------------------- /docs/source/supervised.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Supervised Classification 3 | ========================= 4 | 5 | Supervised learning takes in both a set of *input features* and their 6 | corresponding *labels* to produce a model which can then be fed an unknown 7 | instance and produce a label for it. 8 | 9 | Typical supervised learning models are SVMs and decision trees. 10 | 11 | Example 12 | ------- 13 | :: 14 | 15 | features = np.random.randn(100,20) 16 | features[:50] *= 2 17 | labels = np.repeat((0,1), 50) 18 | 19 | classifier = milk.defaultclassifier() 20 | model = classifier.train(features, labels) 21 | new_label = model.apply(np.random.randn(20)) 22 | new_label2 = model.apply(np.random.randn(20)*2) 23 | 24 | Learners 25 | -------- 26 | 27 | All learners have a ``train`` function which takes at least 2 arguments: 28 | - features : sequence of features 29 | - labels : sequence of labels 30 | 31 | (They may take more parameters). 32 | 33 | They return a *model* object, which has an ``apply`` function which takes a 34 | single input and returns its label.
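A minimal sketch of this pattern, using the default learner on synthetic data (any other learner could be substituted for ``milk.defaultlearner()``)::

    import numpy as np
    import milk

    features = np.random.randn(100, 20)
    features[:50] *= 2
    labels = np.repeat((0, 1), 50)

    learner = milk.defaultlearner()
    model = learner.train(features, labels)  # returns a new, independent model
    print(model.apply(np.random.randn(20)))  # apply() classifies a single instance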
35 | 36 | Note that there are always two objects: the learner and the model, and they are 37 | independent. Every time you call ``learner.train()`` you get a new model. This 38 | is different from the typical interface where you first call ``train()`` and 39 | later ``apply()`` (or equivalent names) on the same object. This is a better 40 | interface because the type system protects you against calling ``apply()`` on 41 | the wrong object and because it is often the case that you want to learn several 42 | models with the same learner. The only disadvantage is that the word 43 | *classifier* can be used for both, so in the documentation, we always refer to 44 | *learners* and *models.* 45 | 46 | Both learners and models are pickle()able. 47 | 48 | Composition and Defaults 49 | ------------------------ 50 | 51 | The style of milk involves many small objects, each providing one step of the 52 | pipeline. For example: 53 | 54 | 1. remove NaNs and Infs from features 55 | 2. bring features to the [-1, 1] interval 56 | 3. feature selection by removing linearly dependent features and then SDA 57 | 4. one-vs-one classifier based on a grid search for the parameters of an SVM 58 | classifier 59 | 60 | To get this you can use:: 61 | 62 | classifier = ctransforms( 63 | chkfinite(), 64 | interval_normalise(), 65 | featureselector(linear_independent_features), 66 | sda_filter(), 67 | gridsearch(one_against_one(svm.svm_to_binary(svm.svm_raw())), 68 | params={ 69 | 'C': 2.**np.arange(-9,5), 70 | 'kernel': [svm.rbf_kernel(2.**i) for i in np.arange(-7,4)], 71 | } 72 | )) 73 | 74 | As you can see, this is very flexible, but can be tedious. Therefore, milk 75 | provides the above as a single function call: ``defaultclassifier()`` 76 | 77 | 78 | supervised Submodules 79 | --------------------- 80 | 81 | - defaultclassifier: contains a default "good enough" classifier 82 | - svm: related to SVMs 83 | - adaboost: Adaboost 84 | - randomforest: random forests 85 | - grouped: contains objects to transform single object learners into group 86 | learners by voting 87 | - multi: transforms binary learners into multi-class learners (1-vs-1 or 88 | 1-vs-rest) 89 | - featureselection: feature selection 90 | - knn: k-nearest neighbours 91 | - tree: decision tree learners 92 | 93 | -------------------------------------------------------------------------------- /get-eigen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget http://bitbucket.org/eigen/eigen/get/3.1.2.tar.bz2 3 | tar xjf 3.1.2.tar.bz2 4 | cd eigen-eigen-5097c01bcdc4 5 | mkdir -p ../milk/supervised/eigen3 6 | cp -r Eigen ../milk/supervised/eigen3 7 | 8 | -------------------------------------------------------------------------------- /milk/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall
be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | ''' 24 | Milk 25 | 26 | Machine learning in Python 27 | 28 | Toplevel functions 29 | ------------------ 30 | - nfoldcrossvalidation: n-fold crossvalidation 31 | - defaultclassifier: get a general purpose classifier 32 | - kmeans: kmeans clustering 33 | 34 | Modules 35 | ------- 36 | - supervised 37 | - unsupervised 38 | - measures 39 | 40 | Example 41 | ------- 42 | 43 | :: 44 | 45 | features = np.random.randn(100,20) 46 | features[:50] *= 2 47 | labels = np.repeat((0,1), 50) 48 | 49 | classifier = milk.defaultclassifier() 50 | model = classifier.train(features, labels) 51 | new_label = model.apply(np.random.randn(20)) 52 | new_label2 = model.apply(np.random.randn(20)*2) 53 | 54 | ''' 55 | 56 | try: 57 | from .nfoldcrossvalidation import nfoldcrossvalidation 58 | from .supervised.defaultclassifier import defaultclassifier 59 | from .supervised.defaultlearner import defaultlearner 60 | from .unsupervised.kmeans import kmeans 61 | from .unsupervised import pdist, zscore, pca 62 | from .milk_version import __version__ 63 | except ImportError as e: 64 | import sys 65 | sys.stderr.write('''\ 66 | Could not import submodules (exact error was: {}). 67 | 68 | There are many reasons for this error; the most common one is that you have 69 | either not built the packages, or have built (using `python setup.py build`) or 70 | installed them (using `python setup.py install`) and then proceeded to test 71 | milk **without changing the current directory**. 72 | 73 | Try installing and then changing to another directory before importing milk. 74 | '''.format(e)) 75 | 76 | __all__ = [ 77 | '__version__', 78 | 'kmeans', 79 | 'pdist', 80 | 'zscore', 81 | 'defaultclassifier', 82 | 'defaultlearner', 83 | 'nfoldcrossvalidation', 84 | ] 85 | -------------------------------------------------------------------------------- /milk/active/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/active/__init__.py -------------------------------------------------------------------------------- /milk/active/eimpact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software.
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | from __future__ import division 23 | import numpy 24 | from ..supervised import svm 25 | from ..supervised.classifier import ctransforms 26 | 27 | 28 | def expected_impacts(D,labels,U): 29 | ''' 30 | EIs = expected_impacts(D,labels,U) 31 | 32 | Compute Expected impact for each element of U 33 | 34 | EIs[i]: P(label[i] == 1) * IMPACT(label[i] == 1) + P(label[i] == 0) * IMPACT(label[i] == 0) 35 | ''' 36 | assert len(D) == len(labels), 'Number of labeled examples should match length of labels vector' 37 | 38 | K = svm.rbf_kernel(20000) 39 | prob_classifier = ctransforms(svm.svm_raw(kernel=K,C=4),svm.svm_sigmoidal_correction()) 40 | label_classifier = ctransforms(svm.svm_raw(kernel=K,C=4),svm.svm_binary()) 41 | 42 | prob_classifier.train(D,labels) 43 | u_probs = prob_classifier(U) 44 | u_labels = (u_probs > .5) 45 | impacts = [] 46 | for u,p in zip(U,u_probs): 47 | print(len(impacts)) 48 | label_classifier.train(numpy.vstack((D,u)),numpy.hstack((labels,[0]))) 49 | u_labels_0 = label_classifier(U) 50 | 51 | label_classifier.train(numpy.vstack((D,u)),numpy.hstack((labels,[1]))) 52 | u_labels_1 = label_classifier(U) 53 | 54 | e_impact = (1.-p)*(u_labels != u_labels_0).sum() + p*(u_labels != u_labels_1).sum() 55 | 56 | impacts.append(e_impact) 57 | return impacts 58 | 59 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 60 | -------------------------------------------------------------------------------- /milk/active/uncertainty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | ''' 24 | Uncertainty 25 | ============ 26 | 27 | Implements uncertainty-based active learning strategies.
28 | 29 | These are strategies that are based on querying those elements in the pool 30 | which we are most uncertain about. 31 | 32 | Functions 33 | ---------- 34 | * entropy 35 | * one_minus_max 36 | ''' 37 | 38 | from __future__ import division 39 | import numpy as np 40 | 41 | def entropy(model, pool): 42 | ''' 43 | entropies = entropy(model, pool) 44 | 45 | Returns the entropy of each classification output for 46 | members in the pool. 47 | ''' 48 | def _entropy(ps): 49 | # ps is the sequence of class probabilities returned by model.apply() 50 | H = 0. 51 | for p in ps: 52 | if p > 1e-9: 53 | H -= p * np.log(p) 54 | return H 55 | return [_entropy(model.apply(u)) for u in pool] 56 | 57 | def one_minus_max(model,pool): 58 | ''' 59 | oneminus = one_minus_max(model,pool) 60 | 61 | oneminus[i] = 1 - max_L { P(pool_i == L) } 62 | 63 | Returns one minus the probability for the best label guess. 64 | ''' 65 | def _minus1(ps): 66 | return 1. - np.max(ps) 67 | return [_minus1(model.apply(u)) for u in pool] 68 | 69 | 70 | -------------------------------------------------------------------------------- /milk/demos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/demos/__init__.py -------------------------------------------------------------------------------- /milk/demos/adaboost.py: -------------------------------------------------------------------------------- 1 | import pylab as plt 2 | import milk.supervised.tree 3 | import milk.supervised.adaboost 4 | from milksets import wine 5 | import milk.supervised.multi 6 | 7 | weak = milk.supervised.tree.stump_learner() 8 | learner = milk.supervised.adaboost.boost_learner(weak) 9 | learner = milk.supervised.multi.one_against_one(learner) 10 | 11 | features, labels = wine.load() 12 | cmat,names,predictions = milk.nfoldcrossvalidation(features,labels, classifier=learner, return_predictions=True) 13 | colors = "rgb" 14 | codes = "xo" 15 | for y,x,r,p in zip(features.T[0], features.T[1], labels, predictions): 16 | code = codes[int(r == p)] 17 | plt.plot([y],[x], colors[p]+code) 18 | plt.show() 19 | 20 | -------------------------------------------------------------------------------- /milk/demos/rf_wine_2d.py: -------------------------------------------------------------------------------- 1 | from milk.supervised import randomforest 2 | from milk.supervised.multi import one_against_one 3 | import milk.nfoldcrossvalidation 4 | import milk.unsupervised 5 | 6 | import pylab 7 | from milksets import wine 8 | 9 | # Load 'wine' dataset 10 | features, labels = wine.load() 11 | # random forest learner 12 | rf_learner = randomforest.rf_learner() 13 | # rf is a binary learner, so we transform it into a multi-class classifier 14 | learner = one_against_one(rf_learner) 15 | 16 | # cross validate with this learner and return predictions on left-out elements 17 | cmat,names, preds = milk.nfoldcrossvalidation(features, labels, classifier=learner, return_predictions=1) 18 | 19 | print('cross-validation accuracy:', cmat.trace()/float(cmat.sum())) 20 | 21 | # dimensionality reduction for display 22 | x,v = milk.unsupervised.pca(features) 23 | colors = "rgb" # predicted colour 24 | marks = "xo" # whether the prediction was correct 25 | for (y,x),p,r in zip(x[:,:2], preds, labels): 26 | c = colors[p] 27 | m = marks[p == r] 28 | pylab.plot(y,x,c+m) 29 | pylab.show() 30 | 31 | -------------------------------------------------------------------------------- /milk/demos/svm-decision-boundary.py:
-------------------------------------------------------------------------------- 1 | from pylab import * 2 | import numpy as np 3 | 4 | from milksets.wine import load 5 | import milk.supervised 6 | import milk.unsupervised.pca 7 | import milk.supervised.svm 8 | 9 | features, labels = load() 10 | features = features[labels < 2] 11 | labels = labels[labels < 2] 12 | features,_ = milk.unsupervised.pca(features) 13 | features = features[:,:2] 14 | learner = milk.supervised.svm.svm_raw(kernel=np.dot, C=12) 15 | model = learner.train(features, labels) 16 | w = np.dot(model.svs.T, model.Yw) 17 | b = model.b 18 | x = np.linspace(-.5, .1, 100) 19 | y = -w[0]/w[1]*x + b/w[1] 20 | plot(features[labels == 1][:,0], features[labels == 1][:,1], 'bx') 21 | plot(features[labels == 0][:,0], features[labels == 0][:,1], 'ro') 22 | plot(x,y) 23 | savefig('svm-demo-points.pdf') 24 | 25 | clf() 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | learner = milk.supervised.svm.svm_raw(kernel=milk.supervised.svm.rbf_kernel(1.), C=12) 34 | model = learner.train(features, labels) 35 | Y, X = (np.mgrid[:101,:101]-50)/12.5 36 | values = [model.apply((y,x)) for y,x in zip(Y.ravel(),X.ravel())] 37 | values = np.array(values).reshape(Y.shape) 38 | sfeatures = features*12.5 39 | sfeatures += 50 40 | plot(sfeatures[labels == 0][:,0], sfeatures[labels == 0][:,1], 'bo') 41 | plot(sfeatures[labels == 1][:,0], sfeatures[labels == 1][:,1], 'ro') 42 | imshow(values.T) 43 | savefig('svm-demo-boundary.pdf') 44 | 45 | 46 | -------------------------------------------------------------------------------- /milk/ext/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | =============== 3 | Milk Extensions 4 | =============== 5 | 6 | These are modules whose functionality is not really part of the core 7 | functionality of milk, but which are useful with it. 8 | ''' 9 | 10 | -------------------------------------------------------------------------------- /milk/measures/__init__.py: -------------------------------------------------------------------------------- 1 | from .measures import accuracy, waccuracy, zero_one_loss, confusion_matrix, bayesian_significance 2 | -------------------------------------------------------------------------------- /milk/measures/cluster_agreement.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | def rand_arand_jaccard(recovered, labels): 10 | ''' 11 | rand, a_rand, jaccard = rand_arand_jaccard(recovered, labels) 12 | 13 | Compute Rand, Adjusted Rand, and Jaccard indices 14 | 15 | These share most of the computation. Therefore, it is best to compute them 16 | together even if you are only going to use some. 
17 | 18 | Parameters 19 | ---------- 20 | recovered : sequence of int 21 | The recovered clusters 22 | labels : sequence of int 23 | Underlying labels 24 | 25 | Returns 26 | ------- 27 | rand : float 28 | Rand index 29 | a_rand : float 30 | Adjusted Rand index 31 | jaccard : float 32 | Jaccard index 33 | 34 | References 35 | ---------- 36 | http://en.wikipedia.org/wiki/Rand_index 37 | http://en.wikipedia.org/wiki/Jaccard_index 38 | ''' 39 | 40 | from scipy.misc import comb 41 | recovered = np.asanyarray(recovered) 42 | labels = np.asanyarray(labels) 43 | contig,_,_ = np.histogram2d(recovered, labels,np.arange(max(recovered.max()+2,labels.max()+2))) 44 | A_0 = contig.sum(0) 45 | A_1 = contig.sum(1) 46 | Ai2 = np.sum(A_0*(A_0-1)/2.) 47 | Bi2 = np.sum(A_1*(A_1-1)/2.) 48 | n = A_0.sum() 49 | 50 | a = comb(contig.ravel(), 2).sum() 51 | b = comb(A_0, 2).sum()-a 52 | c = comb(A_1, 2).sum()-a 53 | d = comb(n, 2)-a-b-c 54 | rand = (a+d)/(a+b+c+d) 55 | jaccard = a/(a+b+c) # pairs together in both clusterings over pairs together in at least one 56 | 57 | index = np.sum(contig*(contig-1)/2) 58 | expected = Ai2*Bi2/n/(n-1)*2. 59 | maxindex = (Ai2+Bi2)/2. 60 | a_rand = (index-expected)/(maxindex-expected) 61 | 62 | return rand, a_rand, jaccard 63 | 64 | -------------------------------------------------------------------------------- /milk/measures/curves.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | def precision_recall(values, labels, mode='all', nr_steps=100): 10 | ''' 11 | precision, recall = precision_recall(values, labels, mode='all', nr_steps=100) 12 | plot(precision, recall) 13 | 14 | Compute a precision-recall curve. 15 | 16 | For a given threshold ``T``, consider that the positions where ``values >= 17 | T`` are classified as True. Precision is defined as ``TP/(TP+FP)``, while 18 | recall is defined as ``TP/(TP+FN)``. 19 | 20 | Parameters 21 | ---------- 22 | values : sequence of numbers 23 | labels : boolean sequence 24 | mode : str, optional 25 | Which thresholds to consider. Either 'all' (i.e., use all values of 26 | `values` as possible thresholds), or 'steps' (using `nr_steps` 27 | equidistant points from ``min(values)`` to ``max(values)``) 28 | nr_steps : integer, optional 29 | How many steps to use. Only meaningful if ``mode == 'steps'`` 30 | 31 | Returns 32 | ------- 33 | precision : a sequence of floats 34 | recall : a sequence of floats 35 | 36 | Actually, a ``2 x P`` array is returned.
37 |     '''
38 |
39 |     values = np.asanyarray(values)
40 |     labels = np.asanyarray(labels)
41 |     if len(values) != len(labels):
42 |         raise ValueError('milk.measures.precision_recall: `values` must be of same length as `labels`')
43 |     if mode == 'all':
44 |         points = list(set(values))
45 |         points.sort()
46 |     elif mode == 'steps':
47 |         points = np.linspace(values.min(), values.max(), nr_steps)
48 |     else:
49 |         raise ValueError('milk.measures.precision_recall: cannot handle mode: `%s`' % mode)
50 |     true_pos = float(np.sum(labels))
51 |     precision_recall = np.empty((len(points),2), float)
52 |
53 |     for i,p in enumerate(points):
54 |         selected = (values >= p)
55 |         selected = labels[selected]
56 |         precision_recall[i] = (np.mean(selected), np.sum(selected)/true_pos)
57 |     return precision_recall.T
58 |
59 | def roc(values, labels, mode='all', nr_steps=100):
60 |     '''
61 |     fpr, tpr = roc(values, labels, mode='all', nr_steps=100)
62 |     plot(fpr, tpr)
63 |
64 |     Compute a ROC curve
65 |
66 |     For a given threshold ``T``, consider that the positions where ``values >=
67 |     T`` are classified as True. The true positive rate is ``TP/(TP+FN)``,
68 |     while the false positive rate is ``FP/(FP+TN)``.
69 |
70 |     Parameters
71 |     ----------
72 |     values : sequence of numbers
73 |     labels : boolean sequence
74 |     mode : str, optional
75 |         Which thresholds to consider. Either 'all' (i.e., use all values of
76 |         `values` as possible thresholds), or 'steps' (using `nr_steps`
77 |         equidistant points from ``min(values)`` to ``max(values)``)
78 |     nr_steps : integer, optional
79 |         How many steps to use. Only meaningful if ``mode == 'steps'``
80 |
81 |     Returns
82 |     -------
83 |     fpr : a sequence of floats
84 |     tpr : a sequence of floats
85 |
86 |     The two sequences are returned stacked as a single ``2 x P`` array.
87 |     '''
88 |     values = np.asanyarray(values)
89 |     labels = np.asanyarray(labels)
90 |     if len(values) != len(labels):
91 |         raise ValueError('milk.measures.roc: `values` must be of same length as `labels`')
92 |     if mode == 'all':
93 |         points = list(set(values))
94 |         points.sort()
95 |     elif mode == 'steps':
96 |         points = np.linspace(values.min(), values.max(), nr_steps)
97 |     else:
98 |         raise ValueError('milk.measures.roc: cannot handle mode: `%s`' % mode)
99 |     roc = np.empty((len(points),2), float)
100 |     P = float(np.sum(labels))
101 |     N = len(labels)-P
102 |     for i,p in enumerate(reversed(points)):
103 |         selected = labels[values >= p]
104 |         roc[i] = (np.sum(~selected)/N, np.sum(selected)/P)
105 |     return roc.T
106 |
107 |
-------------------------------------------------------------------------------- /milk/milk_version.py: --------------------------------------------------------------------------------
1 | __version__ = '0.6.1'
2 |
-------------------------------------------------------------------------------- /milk/nfoldcrossvalidation.py: --------------------------------------------------------------------------------
1 | from .measures.nfoldcrossvalidation import foldgenerator, getfold, nfoldcrossvalidation
2 |
-------------------------------------------------------------------------------- /milk/supervised/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | '''
8 | milk.supervised
9 |
10 | This holds the supervised classification modules:
11 |
12 | Submodules
13 | ----------
14 |
15 | - defaultclassifier: contains a default "good enough" classifier
16 | - svm: related to SVMs
17 | - grouped: contains objects to transform single object classifiers into group classifiers
18 |   by voting
19 | - multi: transforms binary classifiers into multi-class classifiers (1-vs-1 or 1-vs-rest)
20 | - featureselection: feature selection
21 | - knn: k-nearest neighbours
22 | - tree: decision tree classifiers
23 |
24 | Classifiers
25 | -----------
26 |
27 | All classifiers have a `train` function which takes 2 arguments:
28 | - features : sequence of features
29 | - labels : sequence of labels
30 | They return a `model` object, which has an `apply` function which takes a
31 | single input and returns its label.
32 |
33 | Note that there are always two objects: the learner and the model, and they are
34 | independent. Every time you call learner.train() you get a new model.
35 |
36 | Both learners and models are pickle()able.
37 |
38 | Example
39 | -------
40 | ::
41 |
42 |     features = np.random.randn(100,20)
43 |     features[:50] *= 2
44 |     labels = np.repeat((0,1), 50)
45 |
46 |     classifier = milk.defaultclassifier()
47 |     model = classifier.train(features, labels)
48 |     new_label = model.apply(np.random.randn(20))
49 |     new_label2 = model.apply(np.random.randn(20)*2)
50 | '''
51 |
52 | from .defaultclassifier import defaultclassifier, svm_simple
53 | from .classifier import normaliselabels
54 | from .gridsearch import gridsearch
55 | from .tree import tree_learner
56 | from .lasso import lasso, lasso_learner, lasso_model_walk, lasso_walk
57 |
58 | __all__ = [
59 |     'normaliselabels',
60 |     'defaultclassifier',
61 |     'svm_simple',
62 |     'gridsearch',
63 |     'lasso',
64 |     'lasso_learner',
65 |     'lasso_model_walk',
66 |     'lasso_walk',
67 |     'tree_learner',
68 |     ]
69 |
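All of the learners exported above follow this train/apply protocol, which is also
what n-fold cross-validation consumes. A short sketch on synthetic data (assuming
the top-level re-exports ``milk.defaultlearner`` and ``milk.nfoldcrossvalidation``,
which the test files later in this listing also use)::

    import numpy as np
    import milk

    features = np.random.rand(120, 10)
    features[:60] += .8                  # separate the two classes
    labels = np.repeat((0, 1), 60)
    # cmatrix is a confusion matrix; names maps its rows back to label values
    cmatrix, names = milk.nfoldcrossvalidation(features, labels,
                                               learner=milk.defaultlearner())
    accuracy = cmatrix.trace() / cmatrix.sum()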
-------------------------------------------------------------------------------- /milk/supervised/_perceptron.cpp: --------------------------------------------------------------------------------
1 | // Copyright (C) 2011-2015, Luis Pedro Coelho
2 | // License: MIT
3 |
4 | #include <iostream>
5 | #include <memory>
6 | #include <cmath>
7 | #include <cassert>
8 | #include "../utils/utils.h"
9 | extern "C" {
10 |     #include <Python.h>
11 |     #include <numpy/ndarrayobject.h>
12 | }
13 |
14 |
15 | namespace {
16 |
17 | template <typename T>
18 | int perceptron(PyArrayObject* data_arr, const long* labels, PyArrayObject* weights_arr, double eta) {
19 |     const T* data = reinterpret_cast<const T*>(PyArray_DATA(data_arr));
20 |     T* weights = reinterpret_cast<T*>(PyArray_DATA(weights_arr));
21 |     const int N0 = PyArray_DIM(data_arr, 0);
22 |     const int N1 = PyArray_DIM(data_arr, 1);
23 |     int nr_errors = 0;
24 |     for (int i = 0; i != N0; ++i, data += N1, ++labels) {
25 |         T val = weights[0];
26 |         for (int j = 0; j != N1; ++j) {
27 |             val += weights[j+1] * data[j];
28 |         }
29 |         int ell = (val > 0);
30 |         if (ell != *labels) {
31 |             int pm = (*labels ? +1 : -1);
32 |             ++nr_errors;
33 |             T error = pm * eta * std::abs(pm-val);
34 |             weights[0] += error;
35 |             for (int j = 0; j != N1; ++j) {
36 |                 weights[j+1] += error*data[j];
37 |             }
38 |         }
39 |     }
40 |     return nr_errors;
41 | }
42 |
43 | PyObject* py_perceptron(PyObject* self, PyObject* args) {
44 |     const char* errmsg = "Arguments were not what was expected for perceptron.\n"
45 |                          "This is an internal function: Do not call directly unless you know exactly what you're doing.\n";
46 |     PyArrayObject* data;
47 |     PyArrayObject* labels;
48 |     PyArrayObject* weights;
49 |     double eta;
50 |     if (!PyArg_ParseTuple(args, "OOOd", &data, &labels, &weights, &eta)) {
51 |         PyErr_SetString(PyExc_RuntimeError,errmsg);
52 |         return 0;
53 |     }
54 |     if (!PyArray_Check(data) || !PyArray_ISCONTIGUOUS(data) ||
55 |         !PyArray_Check(weights) || !PyArray_ISCONTIGUOUS(weights) ||
56 |         !PyArray_Check(labels) || !PyArray_ISCONTIGUOUS(labels) || !PyArray_EquivTypenums(PyArray_TYPE(labels), NPY_LONG) ||
57 |         PyArray_TYPE(data) != PyArray_TYPE(weights)||
58 |         PyArray_NDIM(data) != 2 || PyArray_NDIM(weights) != 1 || PyArray_DIM(data,1) + 1 != PyArray_DIM(weights,0)) {
59 |         PyErr_SetString(PyExc_RuntimeError,errmsg);
60 |         return 0;
61 |     }
62 |     int nr_errors;
63 |     if (PyArray_TYPE(data) == NPY_FLOAT) {
64 |         nr_errors = perceptron<float>(data, reinterpret_cast<const long*>(PyArray_DATA(labels)), weights, eta);
65 |     } else if (PyArray_TYPE(data) == NPY_DOUBLE) {
66 |         nr_errors = perceptron<double>(data, reinterpret_cast<const long*>(PyArray_DATA(labels)), weights, eta);
67 |     } else {
68 |         PyErr_SetString(PyExc_RuntimeError, errmsg);
69 |         return 0;
70 |     }
71 |     return PyLong_FromLong(nr_errors);
72 | }
73 |
74 | PyMethodDef methods[] = {
75 |     {"perceptron", py_perceptron, METH_VARARGS , "Do NOT call directly.\n" },
76 |     {NULL, NULL,0,NULL},
77 | };
78 |
79 | const char * module_doc =
80 |     "Internal Module.\n"
81 |     "\n"
82 |     "Do NOT use directly!\n";
83 |
84 | } // namespace
85 |
86 | DECLARE_MODULE(_perceptron)
87 |
88 |
-------------------------------------------------------------------------------- /milk/supervised/adaboost.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | # License: MIT. See COPYING.MIT file in the milk distribution
5 |
6 | from __future__ import division
7 | import numpy as np
8 | from .normalise import normaliselabels
9 | from .base import supervised_model
10 |
11 | '''
12 | AdaBoost
13 |
14 | Simple implementation of Adaboost
15 |
16 | Learner
17 | -------
18 |
19 | boost_learner
20 |
21 | '''
22 |
23 | __all__ = [
24 |     'boost_learner',
25 |     ]
26 |
27 | def _adaboost(features, labels, base, max_iters):
28 |     m = len(features)
29 |     D = np.ones(m, dtype=float)
30 |     D /= m
31 |     Y = np.ones(len(labels), dtype=float)
32 |     names = np.array([-1, +1])
33 |     Y = names[labels]
34 |     H = []
35 |     A = []
36 |     for t in range(max_iters):
37 |         Ht = base.train(features, labels, weights=D)
38 |         train_out = np.array(list(map(Ht.apply, features)))
39 |         train_out = names[train_out.astype(int)]
40 |         Et = np.dot(D, (Y != train_out))
41 |         if Et > .5:
42 |             # early return
43 |             break
44 |         At = .5 * np.log((1. + Et) / (1.
- Et)) 45 | D *= np.exp((-At) * Y * train_out) 46 | D /= np.sum(D) 47 | A.append(At) 48 | H.append(Ht) 49 | return H, A 50 | 51 | 52 | class boost_model(supervised_model): 53 | def __init__(self, H, A, names): 54 | self.H = H 55 | self.A = A 56 | self.names = names 57 | 58 | def apply(self, f): 59 | v = sum((a*h.apply(f)) for h,a in zip(self.H, self.A)) 60 | v /= np.sum(self.A) 61 | return self.names[v > .5] 62 | 63 | 64 | class boost_learner(object): 65 | ''' 66 | learner = boost_learner(weak_learner_type(), max_iters=100) 67 | model = learner.train(features, labels) 68 | test = model.apply(f) 69 | 70 | AdaBoost learner 71 | 72 | Attributes 73 | ---------- 74 | base : learner 75 | Weak learner 76 | max_iters : integer 77 | Nr of iterations (default: 100) 78 | ''' 79 | def __init__(self, base, max_iters=100): 80 | self.base = base 81 | self.max_iters = max_iters 82 | 83 | def train(self, features, labels, normalisedlabels=False, names=(0,1), weights=None, **kwargs): 84 | if not normalisedlabels: 85 | labels,names = normaliselabels(labels) 86 | H,A = _adaboost(features, labels, self.base, self.max_iters) 87 | return boost_model(H, A, names) 88 | -------------------------------------------------------------------------------- /milk/supervised/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | 8 | class supervised_model(object): 9 | def apply_many(self, fs): 10 | ''' 11 | labels = model.apply_many( examples ) 12 | 13 | This is equivalent to ``map(model.apply, examples)`` but may be 14 | implemented in a faster way. 15 | 16 | Parameters 17 | ---------- 18 | examples : sequence of training examples 19 | 20 | Returns 21 | ------- 22 | labels : sequence of labels 23 | ''' 24 | return list(map(self.apply, fs)) 25 | 26 | 27 | class base_adaptor(object): 28 | def __init__(self, base): 29 | self.base = base 30 | 31 | def set_option(self, k, v): 32 | self.base.set_option(k, v) 33 | -------------------------------------------------------------------------------- /milk/supervised/classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | # THE SOFTWARE.
22 |
23 | from __future__ import division
24 | import numpy as np
25 | from .normalise import normaliselabels
26 | from .base import supervised_model
27 |
28 | __all__ = ['normaliselabels', 'ctransforms']
29 |
30 | class threshold_model(object):
31 |     '''
32 |     threshold_model
33 |
34 |     Attributes
35 |     ----------
36 |     threshold : float
37 |         threshold value
38 |     '''
39 |     def __init__(self, threshold=.5):
40 |         self.threshold = threshold
41 |
42 |     def apply(self, f):
43 |         return f >= self.threshold
44 |
45 |     def __repr__(self):
46 |         return 'threshold_model({})'.format(self.threshold)
47 |     __str__ = __repr__
48 |
49 | class fixed_threshold_learner(object):
50 |     def __init__(self, threshold=.5):
51 |         self.threshold = threshold
52 |     def train(self, features, labels, **kwargs):
53 |         return threshold_model(self.threshold)
54 |
55 |     def __repr__(self):
56 |         return 'fixed_threshold_learner({})'.format(self.threshold)
57 |     __str__ = __repr__
58 |
59 |
60 | class ctransforms_model(supervised_model):
61 |     '''
62 |     model = ctransforms_model(models)
63 |
64 |     A model that consists of a series of transformations.
65 |
66 |     See Also
67 |     --------
68 |     ctransforms
69 |     '''
70 |     def __init__(self, models):
71 |         self.models = models
72 |
73 |     def apply_many(self, features):
74 |         if len(features) == 0:
75 |             return features
76 |         for m in self.models:
77 |             features = m.apply_many(features)
78 |         return features
79 |
80 |     def __repr__(self):
81 |         return 'ctransforms_model({})'.format(self.models)
82 |     __str__ = __repr__
83 |
84 |     def __getitem__(self, ix):
85 |         return self.models[ix]
86 |
87 |     def apply(self,features):
88 |         for T in self.models:
89 |             features = T.apply(features)
90 |         return features
91 |
92 | class ctransforms(object):
93 |     '''
94 |     ctransf = ctransforms(c0, c1, c2, ...)
95 |
96 |     Concatenate transforms.
97 |     '''
98 |     def __init__(self,*args):
99 |         self.transforms = args
100 |
101 |
102 |     def train(self, features, labels, **kwargs):
103 |         models = []
104 |         model = None
105 |         for T in self.transforms:
106 |             if model is not None:
107 |                 features = np.array([model.apply(f) for f in features])
108 |             model = T.train(features, labels, **kwargs)
109 |             models.append(model)
110 |         return ctransforms_model(models)
111 |
112 |     def __repr__(self):
113 |         return 'ctransforms(*{})'.format(self.transforms)
114 |
115 |     __str__ = __repr__
116 |
117 |     def set_option(self, opt, val):
118 |         idx, opt = opt
119 |         self.transforms[idx].set_option(opt,val)
120 |
121 |
-------------------------------------------------------------------------------- /milk/supervised/defaultclassifier.py: --------------------------------------------------------------------------------
1 | from milk.supervised.defaultlearner import *
2 | defaultclassifier = defaultlearner
3 |
4 |
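``ctransforms`` above is the pipeline glue used throughout milk: each stage is
trained, its model transforms the features, and the next stage trains on the
output. An illustrative sketch chaining a z-score normaliser with an SVM
(``svm_raw``/``svm_to_binary`` as used in the test files later in this listing;
data is synthetic)::

    import numpy as np
    import milk.supervised.normalise
    import milk.supervised.svm
    from milk.supervised.classifier import ctransforms

    # stage 1 learns the normalisation; stage 2 learns the classifier
    pipeline = ctransforms(
        milk.supervised.normalise.zscore_normalise(),
        milk.supervised.svm.svm_to_binary(
            milk.supervised.svm.svm_raw(kernel=np.dot, C=1.)),
        )
    features = np.random.randn(40, 5)
    features[:20] += 1.5
    labels = np.repeat((0, 1), 20)
    model = pipeline.train(features, labels)
    model.apply(features[0])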
-------------------------------------------------------------------------------- /milk/supervised/grouped.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2010-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 | # -*- coding: utf-8 -*-
7 |
8 | from __future__ import division
9 | import numpy as np
10 | from collections import defaultdict
11 | from .classifier import normaliselabels
12 | from .base import base_adaptor, supervised_model
13 |
14 | __all__ = [
15 |     'voting_learner',
16 |     'mean_learner',
17 |     'remove_outliers',
18 |     'filter_outliers',
19 |     ]
20 |
21 | def _concatenate_features_labels(gfeatures, glabels):
22 |     if type(gfeatures) == np.ndarray and gfeatures.dtype == object:
23 |         gfeatures = list(gfeatures)
24 |     features = np.concatenate(gfeatures)
25 |     labels = []
26 |     for feats,label in zip(gfeatures, glabels):
27 |         labels.extend( [label] * len(feats) )
28 |     return features, labels
29 |
30 | class voting_learner(base_adaptor):
31 |     '''
32 |     Implements a voting scheme for multiple sub-examples per example.
33 |
34 |     classifier = voting_learner(base)
35 |
36 |     `base` should be a binary classifier
37 |
38 |     Example
39 |     -------
40 |
41 |     ::
42 |
43 |         voterlearn = voting_learner(milk.supervised.svm_simple(C=1., kernel=np.dot))
44 |         voter = voterlearn.train(training_groups, labeled_groups)
45 |         res = voter.apply([f0, f1, f3])
46 |
47 |     '''
48 |
49 |     def train(self, gfeatures, glabels, normalisedlabels=False):
50 |         features, labels = _concatenate_features_labels(gfeatures, glabels)
51 |         return voting_model(self.base.train(features, labels))
52 | voting_classifier = voting_learner
53 |
54 |
55 | class voting_model(supervised_model):
56 |     def __init__(self, base):
57 |         self.base = base
58 |
59 |     def apply(self, gfeatures):
60 |         votes = defaultdict(int)
61 |         for feats in gfeatures:
62 |             votes[self.base.apply(feats)] += 1
63 |         best = None
64 |         most_votes = 0
65 |         for k,v in votes.items():
66 |             if v > most_votes:
67 |                 best = k
68 |                 most_votes = v
69 |         return best
70 |
71 | class mean_learner(base_adaptor):
72 |     '''
73 |     Implements a mean scheme for multiple sub-examples per example.
74 |
75 |     classifier = mean_learner(base)
76 |
77 |     `base` should be a classifier that returns a numeric confidence value
78 |     `classifier` will return the **mean** of the base's values over the group
79 |
80 |     Example
81 |     -------
82 |
83 |     ::
84 |
85 |         meanlearner = mean_learner(milk.supervised.svm.svm_raw(kernel=np.dot, C=1.))
86 |         model = meanlearner.train(training_groups, labeled_groups)
87 |         res = model.apply([f0, f1, f3])
88 |
89 |     '''
90 |     def train(self, gfeatures, glabels, normalisedlabels=False):
91 |         features, labels = _concatenate_features_labels(gfeatures, glabels)
92 |         return mean_model(self.base.train(features, labels))
93 |
94 | mean_classifier = mean_learner
95 |
96 | class mean_model(supervised_model):
97 |     def __init__(self, base):
98 |         self.base = base
99 |
100 |     def apply(self, gfeatures):
101 |         return np.mean([self.base.apply(feats) for feats in gfeatures])
102 |
103 |
104 | def remove_outliers(features, limit, min_size):
105 |     '''
106 |     features = remove_outliers(features, limit, min_size)
107 |     Keep the fraction `limit` of examples with the smallest mean squared z-score; if fewer than `min_size` examples would remain, return `features` unchanged.
108 |     '''
109 |     nsize = int(limit * len(features))
110 |     if nsize < min_size:
111 |         return features
112 |
113 |     normed = features - features.mean(0)
114 |     std = normed.std(0)
115 |     std[std == 0] = 1
116 |     normed /= std
117 |     f2_sum1 = (normed**2).mean(1)
118 |     values = f2_sum1.copy()
119 |     values.sort()
120 |     top = values[nsize]
121 |     selected = f2_sum1 < top
122 |     return features[selected]
123 |
124 |
125 | class filter_outliers_model(supervised_model):
126 |     def __init__(self, limit, min_size):
127 |         self.limit = limit
128 |         self.min_size = min_size
129 |
130 |     def apply(self, features):
131 |         return remove_outliers(features, self.limit, self.min_size)
132 |
133 | class filter_outliers(object):
134 |     def __init__(self, limit=.9, min_size=3):
135 |         self.limit = limit
136 |         self.min_size = min_size
137 |
138 |     def train(self, features, labels, normalisedlabels=False):
139 |         return filter_outliers_model(self.limit, self.min_size)
140 |
141 |
-------------------------------------------------------------------------------- /milk/supervised/knn.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | from __future__ import division
8 | from collections import defaultdict
9 | from milk.utils import get_nprandom
10 | import numpy as np
11 | from .base import supervised_model
12 |
13 | __all__ = [
14 |     'kNN',
15 |     'knn_learner',
16 |     'approximate_knn_learner',
17 |     ]
18 |
19 | def _plurality(xs):
20 |     from collections import defaultdict
21 |     counts = defaultdict(int)
22 |     for x in xs: counts[x] += 1
23 |     best,_ = max(iter(counts.items()), key=(lambda k_v: k_v[1]))
24 |     return best
25 |
26 | class kNN(object):
27 |     '''
28 |     k-Nearest Neighbour Classifier
29 |
30 |     Naive implementation of a k-nearest neighbour classifier.
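    Distances are ranked via the expansion ``||x - q||^2 = ||x||^2 - 2 x.q +
    ||q||^2``: the ``||q||^2`` term is the same for every stored example, so
    ``kNN_model.apply`` below drops it before sorting neighbours.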
31 | 32 | C = kNN(k) 33 | 34 | Attributes: 35 | ----------- 36 | k : integer 37 | number of neighbours to consider 38 | ''' 39 | 40 | 41 | def __init__(self, k=1): 42 | self.k = k 43 | 44 | def train(self, features, labels, normalisedlabels=False, copy_features=False): 45 | features = np.asanyarray(features) 46 | labels = np.asanyarray(labels) 47 | if copy_features: 48 | features = features.copy() 49 | labels = labels.copy() 50 | features2 = np.sum(features**2, axis=1) 51 | return kNN_model(self.k, features, features2, labels) 52 | 53 | knn_learner = kNN 54 | 55 | class kNN_model(supervised_model): 56 | def __init__(self, k, features, features2, labels): 57 | self.k = k 58 | self.features = features 59 | self.f2 = features2 60 | self.labels = labels 61 | 62 | def apply(self, features): 63 | features = np.asanyarray(features) 64 | diff2 = np.dot(self.features, (-2.)*features) 65 | diff2 += self.f2 66 | neighbours = diff2.argsort()[:self.k] 67 | labels = self.labels[neighbours] 68 | return _plurality(labels) 69 | 70 | 71 | class approximate_knn_model(supervised_model): 72 | def __init__(self, k, X, projected): 73 | self.k = k 74 | self.X = X 75 | self.projected = projected 76 | self.p2 = np.array([np.dot(p,p) for p in projected]) 77 | 78 | def apply(self, t): 79 | tx = np.dot(self.X.T, t) 80 | d = np.dot(self.projected,tx) 81 | d *= -2 82 | d += self.p2 83 | if self.k == 1: 84 | return np.array([d.argmin()]) 85 | d = d.argsort() 86 | return d[:self.k] 87 | 88 | class approximate_knn_classification_model(supervised_model): 89 | def __init__(self, k, X, projected, labels): 90 | self.base = approximate_knn_model(k, X, projected) 91 | self.labels = labels 92 | 93 | def apply(self, f): 94 | idxs = self.base.apply(f) 95 | return _plurality(self.labels[idxs]) 96 | 97 | class approximate_knn_learner(object): 98 | ''' 99 | approximate_knn_learner 100 | 101 | Learns a k-nearest neighbour classifier, where the proximity is approximate 102 | as it is computed on a small dimensional subspace (random subspace 103 | projection). For many datasets, this is acceptable. 104 | ''' 105 | 106 | def __init__(self, k, ndims=8): 107 | self.k = k 108 | self.ndims = ndims 109 | def train(self, features, labels, **kwargs): 110 | labels = np.asanyarray(labels) 111 | R = get_nprandom(kwargs.get('R')) 112 | _, n_features = features.shape 113 | X = R.random_sample((n_features, self.ndims)) 114 | projected = np.dot(features, X) 115 | return approximate_knn_classification_model(self.k, X, projected, labels.copy()) 116 | 117 | -------------------------------------------------------------------------------- /milk/supervised/logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | from .normalise import normaliselabels 9 | from .base import supervised_model 10 | 11 | __all__ = [ 12 | 'logistic_learner', 13 | ] 14 | 15 | @np.vectorize 16 | def _sigmoidal(z): 17 | if (z > 300): return 1. 18 | if z < -300: return 0. 
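    # saturate far from zero so that np.exp below cannot overflow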
19 | return 1./(1+np.exp(-z)) 20 | 21 | class logistic_model(supervised_model): 22 | def __init__(self, bs): 23 | self.bs = bs 24 | 25 | def apply(self, fs): 26 | return _sigmoidal(self.bs[0] + np.dot(fs, self.bs[1:])) 27 | 28 | class logistic_learner(object): 29 | ''' 30 | learner = logistic_learner(alpha=0.0) 31 | 32 | Logistic regression learner 33 | 34 | There are two implementations: 35 | 36 | 1. One which depends on ``scipy.optimize``. This is the default and is 37 | extremely fast. 38 | 2. If ``import scipy`` fails, then we fall back to a Python only 39 | gradient-descent. This gives good results, but is many times slower. 40 | 41 | Properties 42 | ---------- 43 | 44 | alpha : real, optional 45 | penalty for L2-normalisation. Default is zero, for no penalty. 46 | 47 | ''' 48 | def __init__(self, alpha=0.0): 49 | self.alpha = alpha 50 | 51 | def train(self, features, labels, normalisedlabels=False, names=None, **kwargs): 52 | def error(bs): 53 | response = bs[0] + np.dot(features, bs[1:]) 54 | response = _sigmoidal(response) 55 | diff = response - labels 56 | log_like = np.dot(diff, diff) 57 | L2_penalty = self.alpha * np.dot(bs, bs) 58 | return log_like + L2_penalty 59 | def error_prime(bs): 60 | fB = np.dot(features, bs[1:]) 61 | response = _sigmoidal(bs[0] + fB) 62 | sprime = response * (1-response) 63 | ds = (response - labels) * sprime 64 | b0p = np.sum(ds) 65 | b1p = np.dot(features.T, ds) 66 | bp = np.concatenate( ([b0p], b1p) ) 67 | return 2.*(bp + self.alpha*bs) 68 | 69 | features = np.asanyarray(features) 70 | if not normalisedlabels: 71 | labels, _ = normaliselabels(labels) 72 | N,f = features.shape 73 | bs = np.zeros(f+1) 74 | try: 75 | from scipy import optimize 76 | # Some testing revealed that this was a good combination 77 | # call fmin_cg twice first and then fmin 78 | # I do not understand why 100%, but there it is 79 | bs = optimize.fmin_cg(error, bs, error_prime, disp=False) 80 | bs = optimize.fmin_cg(error, bs, error_prime, disp=False) 81 | bs = optimize.fmin(error, bs, disp=False) 82 | except ImportError: 83 | import warnings 84 | warnings.warn('''\ 85 | milk.supervised.logistic.train: Could not import scipy.optimize. 86 | Fall back to very simple gradient descent (which is slow).''') 87 | bs = np.zeros(f+1) 88 | cur = 1.e-6 89 | ebs = error(bs) 90 | for i in range(1000000): 91 | dir = error_prime(bs) 92 | step = (lambda e : bs - e *dir) 93 | enbs = ebs + 1 94 | while enbs > ebs: 95 | cur /= 2. 96 | if cur == 0.: 97 | break 98 | nbs = step(cur) 99 | enbs = error(nbs) 100 | while cur < 10.: 101 | cur *= 2 102 | nnbs = step(cur) 103 | ennbs = error(nnbs) 104 | if ennbs < enbs: 105 | nbs = nnbs 106 | enbs = ennbs 107 | else: 108 | break 109 | bs = nbs 110 | ebs = enbs 111 | return logistic_model(bs) 112 | -------------------------------------------------------------------------------- /milk/supervised/multi_label.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2015, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. 
See COPYING.MIT file in the milk distribution
5 |
6 | from __future__ import division
7 | import numpy as np
8 | from .base import supervised_model, base_adaptor
9 |
10 | class one_by_one_model(supervised_model):
11 |     def __init__(self, models):
12 |         self.models = models
13 |
14 |     def apply(self, fs):
15 |         result = []
16 |         for ell,model in self.models.items():
17 |             if model.apply(fs):
18 |                 result.append(ell)
19 |         return result
20 |
21 |
22 | class one_by_one(base_adaptor):
23 |     '''
24 |     Implements a 1-vs-all multi-label classifier by transforming a base (binary)
25 |     classifier.
26 |
27 |     Example
28 |     -------
29 |
30 |     features = [....]
31 |     labels = [
32 |         (0,),
33 |         (1,2),
34 |         (0,2),
35 |         (0,3),
36 |         (1,2,3),
37 |         (2,0),
38 |         ...
39 |         ]
40 |     learner = one_by_one(milk.defaultlearner())
41 |     model = learner.train(features, labels)
42 |     '''
43 |     def train(self, features, labels, **kwargs):
44 |         universe = set()
45 |         for ls in labels:
46 |             universe.update(ls)
47 |         models = {}
48 |         for ell in universe:
49 |             contained = np.array([int(ell in ls) for ls in labels])
50 |             models[ell] = self.base.train(features, contained, normalisedlabels=True)
51 |         return one_by_one_model(models)
52 |
53 |     def __str__(self):
54 |         return 'one_by_one({})'.format(self.base)
55 |
-------------------------------------------------------------------------------- /milk/supervised/multi_view.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | import numpy as np
8 |
9 |
10 | __all__ = [
11 |     'multi_view_learner',
12 |     ]
13 | class multi_view_model(object):
14 |     def __init__(self, models):
15 |         self.models = models
16 |
17 |     def apply(self, features):
18 |         if len(features) != len(self.models):
19 |             raise ValueError('milk.supervised.multi_view: Nr of features does not match training data (got %s, expected %s)' % (len(features), len(self.models)))
20 |         Ps = np.array([model.apply(f) for model,f in zip(self.models, features)])
21 |         if np.any(Ps <= 0.): return False
22 |         if np.any(Ps >= 1.): return True
23 |         # This is binary only:
24 |         #   if \prod Pi > \prod (1-Pi) return 1
25 |         # is equivalent to
26 |         #   if \prod Pi/(1-Pi) > 1. return 1
27 |         #   if \sum \log( Pi/(1-Pi) ) > 0. return 1
28 |         return np.sum( np.log(Ps/(1-Ps)) ) > 0
29 |
30 |
31 | class multi_view_learner(object):
32 |     '''
33 |     Multi View Learner
34 |
35 |     This learner learns different classifiers on multiple sets of features and
36 |     combines them for classification.
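    Example
    -------
    A hypothetical sketch with two synthetic feature views, using
    ``logistic_learner`` (defined in this package) as the per-view base,
    since it returns a confidence in (0, 1)::

        import numpy as np
        from milk.supervised.logistic import logistic_learner
        from milk.supervised.multi_view import multi_view_learner

        learner = multi_view_learner([logistic_learner(), logistic_learner()])
        # each example is a pair of views, with 4 and 6 features respectively
        features = [(np.random.randn(4), np.random.randn(6)) for _ in range(30)]
        labels = np.repeat((0, 1), 15)
        model = learner.train(features, labels)
        model.apply((np.random.randn(4), np.random.randn(6)))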
37 |
38 |     '''
39 |     def __init__(self, bases):
40 |         self.bases = bases
41 |
42 |     def train(self, features, labels, normalisedlabels=False):
43 |         features = list(zip(*features))
44 |         if len(features) != len(self.bases):
45 |             raise ValueError('milk.supervised.multi_view_learner: ' +
46 |                 'Nr of features does not match classifier construction (got %s, expected %s)'
47 |                     % (len(features), len(self.bases)))
48 |         models = []
49 |         for basis,f in zip(self.bases, features):
50 |             try:
51 |                 f = np.array(f)
52 |             except:
53 |                 f = np.array(f, dtype=object)
54 |             models.append(basis.train(f, labels))
55 |         return multi_view_model(models)
56 |
57 | multi_view_classifier = multi_view_learner
58 |
-------------------------------------------------------------------------------- /milk/supervised/normalise.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2012, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 | from __future__ import division
8 | import numpy as np
9 | from .base import supervised_model
10 | from ..unsupervised.normalise import zscore
11 |
12 | __all__ = [
13 |     'zscore',
14 |     'zscore_normalise',
15 |     'interval_normalise',
16 |     'chkfinite',
17 |     'sample_to_2min',
18 |     'normaliselabels'
19 |     ]
20 |
21 |
22 | class subtract_divide_model(supervised_model):
23 |     def __init__(self, shift, factor):
24 |         factor[factor == 0] = 1 # This makes the division a null op.
25 |
26 |         self.shift = shift
27 |         self.factor = factor
28 |
29 |     def apply_many(self, features):
30 |         if len(features) == 0:
31 |             return features
32 |         return (features - self.shift)/self.factor
33 |
34 |     def apply(self, features):
35 |         return (features - self.shift)/self.factor
36 |
37 |     def __repr__(self):
38 |         return 'subtract_divide_model(%s, %s)' % (self.shift, self.factor)
39 |
40 | class zscore_normalise(object):
41 |     '''
42 |     Normalise to z-scores
43 |
44 |     A preprocessor that normalises features to z scores.
45 |     '''
46 |
47 |     def train(self, features, labels, **kwargs):
48 |         shift = features.mean(0)
49 |         factor = np.std(features,0)
50 |         return subtract_divide_model(shift, factor)
51 |
52 | class interval_normalise(object):
53 |     '''
54 |     Linearly scale to the interval [-1,1] (per libsvm recommendation)
55 |
56 |     '''
57 |     def train(self, features, labels, **kwargs):
58 |         ptp = features.ptp(0)
59 |         shift = features.min(0) + ptp/2.
60 |         factor = ptp/2.
61 | return subtract_divide_model(shift, factor) 62 | 63 | def __repr__(self): 64 | return 'interval_normalise()' 65 | 66 | 67 | def sample_to_2min(labels): 68 | ''' 69 | selected = sample_to_2min(labels) 70 | 71 | Select examples so that the ratio of size of the largest 72 | class to the smallest class is at most two (i.e., 73 | min_label_count = min { (labels == L).sum() | for L in set(labels) } 74 | for L' in set(labels): 75 | assert (labels == L').sum() <= 2 * min_label_count 76 | ) 77 | 78 | Parameters 79 | ---------- 80 | labels : sequence of labels 81 | 82 | Returns 83 | ------- 84 | selected : a Boolean numpy.ndarray 85 | ''' 86 | from collections import defaultdict 87 | counts = defaultdict(int) 88 | for n in labels: 89 | counts[n] += 1 90 | 91 | labels = np.asanyarray(labels) 92 | max_entries = np.min(list(counts.values()))*2 93 | selected = np.zeros(len(labels), bool) 94 | for c in counts.keys(): 95 | p, = np.where(labels == c) 96 | p = p[:max_entries] 97 | selected[p] = 1 98 | return selected 99 | 100 | 101 | 102 | class chkfinite(supervised_model): 103 | ''' 104 | Fill NaN & Inf values 105 | 106 | Replaces NaN & Inf values with zeros. 107 | ''' 108 | def __init__(self): 109 | pass 110 | 111 | def train(self, features, labels, **kwargs): 112 | return self 113 | 114 | def apply(self, features): 115 | nans = np.isnan(features) + np.isinf(features) 116 | if nans.any(): 117 | features = features.copy() 118 | features[nans] = 0 119 | return features 120 | 121 | def __repr__(self): 122 | return 'chkfinite()' 123 | 124 | def normaliselabels(labels, multi_label=False): 125 | ''' 126 | normalised, names = normaliselabels(labels, multi_label=False) 127 | 128 | If not ``multi_label`` (the default), normalises the labels to be integers 129 | from 0 through N-1. Otherwise, assume that each label is actually a 130 | sequence of labels. 131 | 132 | ``normalised`` is a np.array, while ``names`` is a list mapping the indices to 133 | the old names. 134 | 135 | Parameters 136 | ---------- 137 | labels : any iterable of labels 138 | multi_label : bool, optional 139 | Whether labels are actually composed of multiple labels 140 | 141 | Returns 142 | ------ 143 | normalised : a numpy ndarray 144 | If not ``multi_label``, this is an array of integers 0 .. N-1; 145 | otherwise, it is a boolean array of size len(labels) x N 146 | names : list of label names 147 | ''' 148 | if multi_label: 149 | names = set() 150 | for ell in labels: names.update(ell) 151 | names = list(sorted(names)) 152 | normalised = np.zeros( (len(labels), len(names)), bool) 153 | for i,ls in enumerate(labels): 154 | for ell in map(names.index, ls): 155 | normalised[i,ell] = True 156 | return normalised, names 157 | else: 158 | names = sorted(set(labels)) 159 | normalised = list(map(names.index, labels)) 160 | return np.array(normalised), names 161 | 162 | -------------------------------------------------------------------------------- /milk/supervised/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. 
See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | 10 | def get_parzen_rbf_loocv(features,labels): 11 | xij = np.dot(features,features.T) 12 | f2 = np.sum(features**2,1) 13 | d = f2-2*xij 14 | d = d.T + f2 15 | d_argsorted = d.argsort(1) 16 | d_sorted = d.copy() 17 | d_sorted.sort(1) 18 | e_d = np.exp(-d_sorted) 19 | labels_sorted = labels[d_argsorted].astype(np.double) 20 | labels_sorted *= 2 21 | labels_sorted -= 1 22 | def f(sigma): 23 | k = e_d ** (1./sigma) 24 | return (((k[:,1:] * labels_sorted[:,1:]).sum(1) > 0) == labels).mean() 25 | return f 26 | 27 | 28 | -------------------------------------------------------------------------------- /milk/supervised/perceptron.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | import numpy as np 8 | from .classifier import normaliselabels 9 | from .base import supervised_model 10 | from . import _perceptron 11 | 12 | class perceptron_model(supervised_model): 13 | def __init__(self, w): 14 | self.w = w 15 | 16 | def apply(self, f): 17 | f = np.asanyarray(f) 18 | v = self.w[0] + np.dot(f, self.w[1:]) 19 | return v > 0 20 | 21 | class perceptron_learner(object): 22 | def __init__(self, eta=.1, max_iters=128): 23 | self.eta = eta 24 | self.max_iters = max_iters 25 | 26 | def train(self, features, labels, normalisedlabels=False, **kwargs): 27 | if not normalisedlabels: 28 | labels, _ = normaliselabels(labels) 29 | features = np.asanyarray(features) 30 | if features.dtype not in (np.float32, np.float64): 31 | features = features.astype(np.float64) 32 | weights = np.zeros(features.shape[1]+1, features.dtype) 33 | for i in range(self.max_iters): 34 | errors = _perceptron.perceptron(features, labels, weights, self.eta) 35 | if not errors: 36 | break 37 | return perceptron_model(weights) 38 | 39 | 40 | -------------------------------------------------------------------------------- /milk/supervised/precluster_learner.py: -------------------------------------------------------------------------------- 1 | from .precluster import * 2 | -------------------------------------------------------------------------------- /milk/supervised/randomforest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010-2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. 
See COPYING.MIT file in the milk distribution
6 |
7 | '''
8 | Random Forest
9 | -------------
10 |
11 | Main elements
12 | -------------
13 |
14 | rf_learner : A learner object
15 | '''
16 |
17 | from __future__ import division
18 | import numpy as np
19 | import milk.supervised.tree
20 | from .normalise import normaliselabels
21 | from .base import supervised_model
22 | from ..utils import get_nprandom
23 |
24 | __all__ = [
25 |     'rf_learner',
26 |     ]
27 |
28 | def _sample(features, labels, n, R):
29 |     '''
30 |     features', labels' = _sample(features, labels, n, R)
31 |
32 |     Sample n elements (with replacement) from (features,labels)
33 |
34 |     Parameters
35 |     ----------
36 |     features : sequence
37 |     labels : sequence
38 |         Same size as features
39 |     n : integer
40 |     R : random object
41 |
42 |     Returns
43 |     -------
44 |     features' : sequence
45 |     labels' : sequence
46 |     '''
47 |
48 |     N = len(features)
49 |     sfeatures = []
50 |     slabels = []
51 |     for i in range(n):
52 |         idx = R.randint(N)
53 |         sfeatures.append(features[idx])
54 |         slabels.append(labels[idx])
55 |     return np.array(sfeatures), np.array(slabels)
56 |
57 | class rf_model(supervised_model):
58 |     def __init__(self, forest, names, return_label = True):
59 |         self.forest = forest
60 |         self.names = names
61 |         self.return_label = return_label
62 |
63 |     def apply(self, features):
64 |         rf = len(self.forest)
65 |         votes = sum(t.apply(features) for t in self.forest)
66 |         if self.return_label:
67 |             return (votes > (rf//2))
68 |         return votes / rf
69 |
70 |
71 | class rf_learner(object):
72 |     '''
73 |     Random Forest Learner
74 |
75 |     learner = rf_learner(rf=101, frac=.7)
76 |
77 |     Attributes
78 |     ----------
79 |     rf : integer, optional
80 |         Nr of trees to learn (default: 101)
81 |     frac : float, optional
82 |         Sample fraction
83 |     R : np.random object
84 |         Source of randomness
85 |     '''
86 |     def __init__(self, rf=101, frac=.7, R=None):
87 |         self.rf = rf
88 |         self.frac = frac
89 |         self.R = get_nprandom(R)
90 |
91 |     def train(self, features, labels, normalisedlabels=False, names=None, return_label=True, **kwargs):
92 |         N,M = features.shape
93 |         m = int(self.frac*M)
94 |         n = int(self.frac*N)
95 |         R = get_nprandom(kwargs.get('R', self.R))
96 |         tree = milk.supervised.tree.tree_learner(return_label=return_label)
97 |         forest = []
98 |         if not normalisedlabels:
99 |             labels,names = normaliselabels(labels)
100 |         elif names is None:
101 |             names = (0,1)
102 |         for i in range(self.rf):
103 |             forest.append(
104 |                 tree.train(*_sample(features, labels, n, R),
105 |                     **{'normalisedlabels' : True})) # This syntax is necessary for Python 2.5
106 |         return rf_model(forest, names, return_label)
107 |
108 |
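``rf_learner`` trains ``rf`` trees, each on a fraction ``frac`` of the examples
sampled with replacement, and the model takes a majority vote. A quick sketch on
synthetic data (small ``rf`` to keep it fast)::

    import numpy as np
    from milk.supervised.randomforest import rf_learner

    learner = rf_learner(rf=11, frac=.7)
    features = np.random.rand(60, 8)
    features[:30] += .6                  # separate the two classes
    labels = np.repeat((0, 1), 30)
    model = learner.train(features, labels)
    model.apply(features[0])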
-------------------------------------------------------------------------------- /milk/supervised/set2binary_array.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2008-2011, Luis Pedro Coelho
3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
4 | #
5 | # License: MIT. See COPYING.MIT file in the milk distribution
6 |
7 |
8 | import numpy as np
9 |
10 | __all__ = [
11 |     'set2binary_array',
12 |     ]
13 |
14 | class set2binary_array_model(object):
15 |     def __init__(self, universe):
16 |         self.universe = list(universe)
17 |
18 |     def apply(self, elems):
19 |         res = np.zeros(len(self.universe) + 1, bool)
20 |         for e in elems:
21 |             try:
22 |                 res[self.universe.index(e)] = True
23 |             except ValueError:
24 |                 res[-1] = True
25 |         return res
26 |
27 | class set2binary_array(object):
28 |     def train(self, features, labels, normalisedlabels=False):
29 |         allfeatures = set()
30 |         for f in features:
31 |             allfeatures.update(f)
32 |         return set2binary_array_model(allfeatures)
33 |
-------------------------------------------------------------------------------- /milk/supervised/weighted_voting_adaboost.py: --------------------------------------------------------------------------------
1 | from math import exp, log
2 | from operator import itemgetter
3 |
4 | '''
5 | AdaBoost implementation with weighted voting as a decision procedure
6 | '''
7 | class weighted_voting_adaboost(object):
8 |     # initializes with already-built classifiers and their corresponding coefficients
9 |     def __init__(self, in_classifiers, in_coefficients):
10 |         self.classifiers = in_classifiers
11 |         self.coefficients = in_coefficients
12 |
13 |     # decision by weighted voting
14 |     def apply(self, in_features):
15 |         # a "class number" => "votes value" mapping
16 |         answers = {}
17 |         for classifier, coefficient in zip(self.classifiers, self.coefficients):
18 |             answer = classifier.apply(in_features)
19 |             if answer in answers:
20 |                 answers[answer] += coefficient
21 |             else:
22 |                 answers[answer] = coefficient
23 |         # dict maximum by value
24 |         result = max(iter(answers.items()), key=itemgetter(1))
25 |         return result[0]
26 |
27 |
28 | class weighted_voting_ada_learner(object):
29 |     def __init__(self, in_composition_size, in_learner):
30 |         self.learner = in_learner
31 |         self.composition_size = in_composition_size
32 |
33 |     def reset(self, in_features):
34 |         self.classifiers = []
35 |         # linear coefficients for the classifiers in the composition
36 |         self.coefficients = []
37 |         self.weights = [1. / float(len(in_features))] * len(in_features)
38 |
39 |     def train(self, in_features, in_labels):
40 |         self.reset(in_features)
41 |
42 |         for iteration in range(self.composition_size):
43 |             self.classifiers.append(self.learner.train(in_features, in_labels, weights=self.weights))
44 |             # new classifier initially gets weight 1
45 |             self.coefficients.append(1)
46 |             answers = []
47 |             for obj in in_features:
48 |                 answers.append(self.classifiers[-1].apply(obj))
49 |             err = self.compute_weighted_error(in_labels, answers)
50 |             if abs(err) < 1e-6:
51 |                 return weighted_voting_adaboost(self.classifiers, self.coefficients)
52 |
53 |             alpha = 0.5 * log((1.0 - err) / err)
54 |             # updating the coefficient of the last added classifier
55 |             self.coefficients[-1] = alpha
56 |
57 |             self.update_weights(in_labels, answers, alpha)
58 |             self.normalize_weights()
59 |         return weighted_voting_adaboost(self.classifiers, self.coefficients)
60 |
61 |     def compute_weighted_error(self, in_labels, in_answers):
62 |         error = 0.
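        # error accumulates the weight-normalised fraction of examples the newest classifier got wrong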
63 | w_sum = sum(self.weights) 64 | for ind in range(len(in_labels)): 65 | error += (in_answers[ind] != in_labels[ind]) * self.weights[ind] / w_sum 66 | return error 67 | 68 | def update_weights(self, in_labels, in_answers, in_alpha): 69 | for ind in range(len(in_labels)): 70 | self.weights[ind] *= exp(in_alpha * (in_answers[ind] != in_labels[ind])) 71 | 72 | def normalize_weights(self): 73 | w_sum = sum(self.weights) 74 | 75 | for ind in range(len(self.weights)): 76 | self.weights[ind] /= w_sum 77 | -------------------------------------------------------------------------------- /milk/tests/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import milksets 3 | del milksets 4 | except ImportError: 5 | import sys 6 | sys.stderr.write('''\ 7 | Could not import milksets. 8 | 9 | This companion package does not provide any functionality, but 10 | is necessary for some of the testing.''') 11 | 12 | 13 | def run(verbose=False): 14 | import nose 15 | from os import path 16 | currentdir = path.dirname(__file__) 17 | updir = path.join(currentdir, '..') 18 | argv = ['', '--exe', '-w', updir] 19 | if verbose: 20 | argv.append('--verbose') 21 | nose.run('milk', argv=argv) 22 | 23 | -------------------------------------------------------------------------------- /milk/tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/tests/data/__init__.py -------------------------------------------------------------------------------- /milk/tests/data/jugparallel_jugfile.py: -------------------------------------------------------------------------------- 1 | import milk.ext.jugparallel 2 | from milksets.wine import load 3 | from milk.tests.fast_classifier import fast_classifier 4 | features,labels = load() 5 | classified = milk.ext.jugparallel.nfoldcrossvalidation(features, labels, learner=fast_classifier()) 6 | classified_wpred = milk.ext.jugparallel.nfoldcrossvalidation(features, labels, learner=fast_classifier(), return_predictions=True) 7 | 8 | -------------------------------------------------------------------------------- /milk/tests/data/jugparallel_kmeans_jugfile.py: -------------------------------------------------------------------------------- 1 | import milk.ext.jugparallel 2 | from milksets.wine import load 3 | from milk.tests.fast_classifier import fast_classifier 4 | features,labels = load() 5 | 6 | clustered = milk.ext.jugparallel.kmeans_select_best(features, ks=(2,8), repeats=2, max_iters=6) 7 | 8 | -------------------------------------------------------------------------------- /milk/tests/data/regression-2-Dec-2009.pp.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/milk/abc2a28b526c199414d42c0a26092938968c3caf/milk/tests/data/regression-2-Dec-2009.pp.gz -------------------------------------------------------------------------------- /milk/tests/fast_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.base import supervised_model 3 | class fast_classifier(object): 4 | def __init__(self): 5 | pass 6 | 7 | def set_option(self, _k, _v): 8 | pass 9 | 10 | def train(self, features, labels, **kwargs): 11 | examples = {} 12 | for f,lab in zip(features, labels): 13 | if lab not in examples: 14 | examples[lab] = f 15 | return 
fast_model(examples) 16 | 17 | class fast_model(supervised_model): 18 | def __init__(self, examples): 19 | self.examples = examples 20 | assert len(self.examples) 21 | 22 | def apply(self, f): 23 | best = None 24 | best_val = +np.inf 25 | for k,v in self.examples.items(): 26 | d = v-f 27 | dist = np.dot(d,d) 28 | if dist < best_val: 29 | best = k 30 | best_val = dist 31 | return best 32 | 33 | 34 | -------------------------------------------------------------------------------- /milk/tests/test_adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.tree 3 | import milk.supervised.adaboost 4 | def test_learner(): 5 | from milksets import wine 6 | learner = milk.supervised.adaboost.boost_learner(milk.supervised.tree.stump_learner()) 7 | features, labels = wine.load() 8 | features = features[labels < 2] 9 | labels = labels[labels < 2] == 0 10 | labels = labels.astype(int) 11 | model = learner.train(features[::2], labels[::2]) 12 | train_out = np.array(list(map(model.apply, features))) 13 | assert (train_out == labels).mean() > .9 14 | 15 | 16 | def test_too_many_boolean_indices_regression(): 17 | import milk.supervised.randomforest 18 | import milk.supervised.adaboost 19 | import milksets.wine 20 | from milk.supervised.multi import one_against_one 21 | 22 | weak = milk.supervised.randomforest.rf_learner() 23 | learner = milk.supervised.adaboost.boost_learner(weak) 24 | learner = one_against_one(learner) 25 | 26 | features, labels = milksets.wine.load() 27 | 28 | # sample features so that the test is faster (still gives error): 29 | learner.train(features[::16], labels[::16]) 30 | -------------------------------------------------------------------------------- /milk/tests/test_affinity.py: -------------------------------------------------------------------------------- 1 | import milk.unsupervised.affinity 2 | import numpy as np 3 | def test_affinity(): 4 | np.random.seed(22) 5 | X = np.random.randn(100,10) 6 | X[:40] += .4 7 | S = milk.unsupervised.pdist(X) 8 | clusters, labels = milk.unsupervised.affinity.affinity_propagation(S) 9 | assert labels.max()+1 == len(clusters) 10 | assert len(labels) == len(X) 11 | assert clusters.max() < len(X) 12 | -------------------------------------------------------------------------------- /milk/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import milk 3 | -------------------------------------------------------------------------------- /milk/tests/test_curves.py: -------------------------------------------------------------------------------- 1 | from milk.measures.curves import precision_recall 2 | import numpy as np 3 | def test_precision_recall(): 4 | labels = [0,1]*10 5 | values = np.linspace(0,1,len(labels)) 6 | precision, recall = precision_recall(values, labels) 7 | assert np.min(recall) >= 0. 8 | assert np.max(recall) <= 1. 9 | assert np.max(precision) <= 1. 10 | assert np.min(precision) >= 0. 11 | 12 | labels = [0]*10 + [1] * 10 13 | values = np.linspace(0,1.,20) 14 | precision,recall = precision_recall(values, labels, 'steps', 10) 15 | assert min(precision) >= .5 16 | assert max(precision) == 1. 17 | assert max(recall) == 1. 
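    # the positives hold the top half of the scores, so precision is at least .5 at every threshold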
18 | 19 | -------------------------------------------------------------------------------- /milk/tests/test_defaultclassifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk 3 | import milk.supervised.defaultclassifier 4 | import pickle 5 | 6 | def test_defaultclassifier(): 7 | from milksets import wine 8 | features, labels = wine.load() 9 | C = milk.supervised.defaultclassifier() 10 | model = C.train(features,labels) 11 | labelset = set(labels) 12 | for f in features: 13 | assert model.apply(f) in labelset 14 | test_defaultclassifier.slow = True 15 | 16 | def test_pickle(): 17 | np.random.seed(23232432) 18 | X = np.random.rand(100,10) 19 | labels = np.zeros(100) 20 | X[50:] += .5 21 | labels[50:] = 1 22 | classifier = milk.supervised.defaultclassifier() 23 | model = classifier.train(X, labels) 24 | s = pickle.dumps(model) 25 | model = pickle.loads(s) 26 | test = [model.apply(x) for x in X] 27 | test = np.array(test) 28 | assert (test == labels).mean() > .6 29 | 30 | def test_pickle_learner(): 31 | learner = milk.defaultlearner() 32 | assert len(pickle.dumps(learner)) 33 | 34 | def test_expandend(): 35 | np.random.seed(23232432) 36 | X = np.random.rand(100,10) 37 | labels = np.zeros(100) 38 | X[50:] += .5 39 | labels[50:] = 1 40 | learners = milk.defaultlearner(expanded=True) 41 | for learner in learners: 42 | model = learner.train(X, labels) 43 | test = [model.apply(x) for x in X] 44 | test = np.array(test) 45 | assert set(test) == set(labels) 46 | 47 | -------------------------------------------------------------------------------- /milk/tests/test_defaultlearner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk 3 | def test_extra_arg(): 4 | from milksets.wine import load 5 | features,labels = load() 6 | learner = milk.defaultlearner() 7 | model = learner.train(features[::2],labels[::2], extra_arg=5) 8 | assert model.apply(features[1]) < 12. 
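    # wine labels are 0, 1, 2, so any sane prediction is far below 12; the point is that extra_arg is accepted and ignored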
9 | 10 | 11 | def test_empty_input(): 12 | learn = milk.defaultlearner() 13 | X = np.random.rand(60, 3) 14 | X[:32] += .52 15 | y = np.arange(60) > 35 16 | model = learn.train(X, y) 17 | preds = model.apply_many([]) 18 | assert len(preds) == 0 19 | -------------------------------------------------------------------------------- /milk/tests/test_ecoc_learner.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.multi import ecoc_learner 2 | from milk.supervised.classifier import ctransforms 3 | from milk.supervised import svm 4 | import milk.tests.fast_classifier 5 | import milk.supervised.multi 6 | from milksets.yeast import load 7 | import numpy as np 8 | 9 | def test_ecoc_learner(): 10 | base = milk.tests.fast_classifier.fast_classifier() 11 | learner = milk.supervised.multi.ecoc_learner(base) 12 | features, labels = load() 13 | nlabels = len(set(labels)) 14 | model = learner.train(features[::2],labels[::2]) 15 | 16 | testl = np.array(model.apply_many(features[1::2])) 17 | assert np.mean(testl == labels[1::2]) > 1./nlabels 18 | assert testl.min() >= 0 19 | assert testl.max() < nlabels 20 | 21 | # This failed at one point: 22 | learner = ecoc_learner(svm.svm_to_binary(svm.svm_raw(kernel=svm.dot_kernel(), C=1.))) 23 | model = learner.train(features[:200], labels[:200]) 24 | assert (model is not None) 25 | 26 | def test_ecoc_probability(): 27 | features,labels = load() 28 | features = features[labels < 5] 29 | labels = labels[labels < 5] 30 | raw = svm.svm_raw(kernel=svm.dot_kernel(), C=1.) 31 | base = ctransforms(raw, svm.svm_sigmoidal_correction()) 32 | learner = ecoc_learner(base, probability=True) 33 | model = learner.train(features[::2], labels[::2]) 34 | results = list(map(model.apply, features[1::2])) 35 | results = np.array(results) 36 | assert results.shape[1] == len(set(labels)) 37 | assert np.mean(results.argmax(1) == labels[1::2]) > .5 38 | -------------------------------------------------------------------------------- /milk/tests/test_ext_jugparallel.py: -------------------------------------------------------------------------------- 1 | try: 2 | import jug 3 | from jug import value 4 | import jug.options 5 | from jug.tests.utils import task_reset, simple_execute 6 | except ImportError: 7 | from nose import SkipTest 8 | def task_reset(f): 9 | def g(): 10 | raise SkipTest() 11 | return g 12 | 13 | @task_reset 14 | def test_nfoldcrossvalidation(): 15 | store, space = jug.jug.init('milk/tests/data/jugparallel_jugfile.py', 'dict_store') 16 | simple_execute() 17 | assert len(jug.value(space['classified'])) == 2 18 | assert len(jug.value(space['classified_wpred'])) ==3 19 | 20 | 21 | @task_reset 22 | def test_kmeans(): 23 | store, space = jug.jug.init('milk/tests/data/jugparallel_kmeans_jugfile.py', 'dict_store') 24 | simple_execute() 25 | assert len(value(space['clustered'])) == 2 26 | -------------------------------------------------------------------------------- /milk/tests/test_featureselection.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.featureselection 2 | from milk.supervised.featureselection import select_n_best, rank_corr 3 | import numpy as np 4 | def test_sda(): 5 | from milksets import wine 6 | features, labels = wine.load() 7 | selected = milk.supervised.featureselection.sda(features,labels) 8 | for sel in selected: 9 | assert sel <= features.shape[1] 10 | 11 | def test_linear_independent_features(): 12 | np.random.seed(122) 13 | X3 = 
np.random.rand(20,3) 14 | X = np.c_[X3,X3*2+np.random.rand(20,3)/20.,-X3*2+np.random.rand(20,3)/10.] 15 | X2 = np.c_[X3,X3*2,-X3*3e-3] 16 | assert len(milk.supervised.featureselection.linear_independent_features(X)) == 9 17 | assert len(milk.supervised.featureselection.linear_independent_features(X2)) == 3 18 | assert np.all(np.sort(milk.supervised.featureselection.linear_independent_features(X2) % 3) == np.arange(3)) 19 | 20 | def _rank(A,tol=1e-8): 21 | s = np.linalg.svd(A,compute_uv=0) 22 | return (s > tol).sum() 23 | 24 | def _slow_linear_independent_features(featmatrix): 25 | ''' 26 | Returns the indices of a set of linearly independent features (columns). 27 | 28 | indices = linear_independent_features(features) 29 | ''' 30 | independent = [0,] 31 | rank = 1 32 | feat = [featmatrix[:,0]] 33 | for i,col in enumerate(featmatrix.T): 34 | feat.append(col) 35 | nrank = _rank(np.array(feat)) 36 | if nrank == rank: 37 | del feat[-1] 38 | else: 39 | rank = nrank 40 | independent.append(i) 41 | return np.array(independent) 42 | 43 | 44 | def test_select_n(): 45 | from milksets.wine import load 46 | 47 | features,labels = load() 48 | for n in (1,2,4,8): 49 | select = select_n_best(n, rank_corr) 50 | model = select.train(features,labels) 51 | f = model.apply(features[3]) 52 | assert len(f) == n 53 | 54 | def slow_rank_corr(features, labels): 55 | features = np.asanyarray(features) 56 | labels = np.asanyarray(labels) 57 | binlabels = [(labels == ell) for ell in set(labels)] 58 | rs = [] 59 | for feat in features.T: 60 | ranks = feat.argsort() 61 | corrcoefs = [np.corrcoef(ranks, labs)[0,1] for labs in binlabels] 62 | corrcoefs = np.array(corrcoefs) 63 | corrcoefs **= 2 64 | rs.append(np.max(corrcoefs)) 65 | return np.array(rs) 66 | 67 | def test_compare_rank_corr(): 68 | from milksets.wine import load 69 | features,labels = load() 70 | r0 = rank_corr(features,labels) 71 | r1 = slow_rank_corr(features,labels) 72 | assert np.allclose(r0,r1) 73 | -------------------------------------------------------------------------------- /milk/tests/test_fisher.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.svm 2 | import milk.supervised.normalise 3 | import numpy as np 4 | 5 | def _slow_f(features,labels,kernel_or_sigma): 6 | try: 7 | kernel = kernel_or_sigma 8 | kernel(features[0],features[1]) 9 | except: 10 | kernel = milk.supervised.svm.rbf_kernel(kernel_or_sigma) 11 | N1 = (labels == 0).sum() 12 | N2 = (labels == 1).sum() 13 | x1 = features[labels == 0] 14 | x2 = features[labels == 1] 15 | dm = 0 16 | for i in range(N1): 17 | for j in range(N1): 18 | dm += kernel(x1[i],x1[j])/N1/N1 19 | for i in range(N2): 20 | for j in range(N2): 21 | dm += kernel(x2[i],x2[j])/N2/N2 22 | for i in range(N1): 23 | for j in range(N2): 24 | dm -= 2*kernel(x1[i],x2[j])/N1/N2 25 | s1 = N1 26 | for i in range(N1): 27 | for j in range(N1): 28 | s1 -= kernel(x1[i],x1[j])/N1 29 | s2 = N2 30 | for i in range(N2): 31 | for j in range(N2): 32 | s2 -= kernel(x2[i],x2[j])/N2 33 | return (s1 + s2)/dm 34 | 35 | 36 | def test_fisher_approx(): 37 | from milksets import wine 38 | features,labels = wine.load() 39 | f = milk.supervised.svm.sigma_value_fisher(features,labels) 40 | 
for sigma in (2.**-4,2.,16.,32.): 41 | assert abs(f(sigma) - _slow_f(features,labels,sigma)) < 1e-6 42 | -------------------------------------------------------------------------------- /milk/tests/test_gaussianmixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.unsupervised import gaussianmixture 3 | 4 | def _sq(x): 5 | return x*x 6 | def test_gm(): 7 | np.random.seed(22) 8 | centroids = np.repeat(np.arange(4), 4).reshape((4,4)) 9 | fmatrix = np.concatenate([(np.random.randn(12,4)+c) for c in centroids]) 10 | assignments = np.repeat(np.arange(4), 12) 11 | rss = sum(np.sum(_sq(fmatrix[i*12:(i+1)*12]-i)) for i in range(4)) 12 | assert np.abs(gaussianmixture.residual_sum_squares(fmatrix, assignments, centroids) - rss) < 1.e-12 13 | assert gaussianmixture.BIC(fmatrix, assignments, centroids) > 0 14 | assert gaussianmixture.AIC(fmatrix, assignments, centroids) > 0 15 | 16 | assert gaussianmixture.BIC(fmatrix, assignments, centroids, model='full_covariance') > \ 17 | gaussianmixture.BIC(fmatrix, assignments, centroids, model='diagonal_covariance') > \ 18 | gaussianmixture.BIC(fmatrix, assignments, centroids, model='one_variance') 19 | 20 | assert gaussianmixture.AIC(fmatrix, assignments, centroids, model='full_covariance') > \ 21 | gaussianmixture.AIC(fmatrix, assignments, centroids, model='diagonal_covariance') > \ 22 | gaussianmixture.AIC(fmatrix, assignments, centroids, model='one_variance') 23 | 24 | -------------------------------------------------------------------------------- /milk/tests/test_gridsearch.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.gridsearch 2 | import milk.supervised.svm 3 | from milk.supervised.gridsearch import gridminimise, _allassignments, _set_assignment, gridsearch 4 | from milk.tests.fast_classifier import fast_classifier 5 | from nose.tools import raises 6 | import numpy as np 7 | 8 | 9 | def slow_gridminimise(learner, features, labels, params, measure=None): 10 | from ..measures.nfoldcrossvalidation import nfoldcrossvalidation 11 | if measure is None: 12 | measure = np.trace 13 | 14 | best_val = -np.inf 15 | best = None 16 | for assignment in _allassignments(params): 17 | _set_assignment(learner, assignment) 18 | S,_ = nfoldcrossvalidation(features, labels, classifier=learner) 19 | cur = measure(S) 20 | if cur > best_val: 21 | best = assignment 22 | best_val = cur 23 | return best 24 | 25 | 26 | def test_gridsearch(): 27 | from milksets import wine 28 | features, labels = wine.load() 29 | selected = (labels < 2) 30 | features = features[selected] 31 | labels = labels[selected] 32 | 33 | G = milk.supervised.gridsearch( 34 | milk.supervised.svm.svm_raw(), 35 | params={'C':[.01,.1,1.,10.], 36 | 'kernel':[milk.supervised.svm.rbf_kernel(0.1),milk.supervised.svm.rbf_kernel(1.)] 37 | }) 38 | model = G.train(features,labels) 39 | reslabels = [model.apply(f) for f in features] 40 | assert len(reslabels) == len(features) 41 | test_gridsearch.slow = True 42 | 43 | 44 | def test_all_assignments(): 45 | assert len(list(_allassignments({'C': [0,1], 'kernel' : ['a','b','c']}))) == 2 * 3 46 | 47 | class error_learner(object): 48 | def train(self, features, labels, **kwargs): 49 | raise ValueError('oops') 50 | 51 | def set_option(self, k, v): 52 | pass 53 | 54 | @raises(Exception) 55 | def test_with_error(): 56 | from milksets.wine import load 57 | features, labels = load() 58 | learner = error_learner() 59 | G = milk.supervised.gridsearch(
60 | error_learner(), 61 | params = { 'error' : list(range(3)), 'error2' : list(range(5)) } 62 | ) 63 | G.train(features,labels) 64 | 65 | 66 | class simple_model: 67 | def __init__(self, c): 68 | self.c = c 69 | def apply(self, f): 70 | return self.c 71 | 72 | def f(a,b,c): 73 | return a**2 + b**3 + c 74 | 75 | class simple_learner: 76 | def set_option(self, k, v): 77 | setattr(self, k, v) 78 | def train(self, fs, ls, normalisedlabels=False): 79 | return simple_model(f(self.a, self.b, self.c)) 80 | 81 | def test_gridminimise(): 82 | features = np.arange(100) 83 | labels = np.tile((0,1), 50) 84 | paramspace = { 'a': np.arange(4), 'b' : np.arange(-3,3), 'c' : np.linspace(2., 10) } 85 | best,value = gridminimise(simple_learner(), features, labels, paramspace, measure=(lambda _, p: p[0]), return_value=True) 86 | best = dict(best) 87 | val = f(best['a'], best['b'], best['c']) 88 | assert value == val*100 89 | for a in np.arange(4): 90 | for b in np.arange(-3,3): 91 | for c in np.linspace(2., 10): 92 | assert val <= f(a,b,c) 93 | gs = gridsearch(simple_learner(), paramspace, measure=(lambda _, p: p[0]), annotate=True) 94 | model = gs.train(features, labels) 95 | assert model.value == value 96 | assert model.arguments == val 97 | 98 | def test_gridminimise_wine(): 99 | from milksets.wine import load 100 | features, labels = load() 101 | x = gridminimise(milk.supervised.svm_simple(kernel=np.dot, C=2.), features[::2], labels[::2] == 0, {'C' : (0.5,) }) 102 | cval, = x 103 | assert cval == ('C', .5) 104 | 105 | def test_gridminimise_return(): 106 | from milksets.wine import load 107 | features,labels = load() 108 | learner = fast_classifier() 109 | gridminimise(learner, features, labels, { 'ignore' : [0] }) 110 | _,error = gridminimise(learner, features, labels, { 'ignore' : [0] }, return_value=True, nfolds=5) 111 | cmat,_ = milk.nfoldcrossvalidation(features, labels, learner=learner, nfolds=5) 112 | assert error == cmat.sum()-cmat.trace() 113 | -------------------------------------------------------------------------------- /milk/tests/test_grouped.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.svm 3 | from milk.supervised.svm import rbf_kernel 4 | import milk.supervised.multi 5 | import milk.supervised.grouped 6 | from milk.supervised.classifier import ctransforms 7 | import milksets.wine 8 | 9 | def group(features, labels, step): 10 | N = len(labels) 11 | i = 0 12 | gfeatures = [] 13 | glabels = [] 14 | while i < N: 15 | next = i + step 16 | while next > N or labels[next-1] != labels[i]: next -= 1 17 | gfeatures.append(features[i:next]) 18 | glabels.append(labels[i]) 19 | i = next 20 | return gfeatures, glabels 21 | 22 | 23 | 24 | def test_voting(): 25 | base = ctransforms(milk.supervised.svm.svm_raw(C=2.,kernel=milk.supervised.svm.rbf_kernel(2.**-3)),milk.supervised.svm.svm_binary()) 26 | base = milk.supervised.multi.one_against_rest(base) 27 | features,labels = milksets.wine.load() 28 | gfeatures, glabels = group(features, labels, 3) 29 | 30 | learner = milk.supervised.grouped.voting_classifier(base) 31 | learner.train(gfeatures, glabels) 32 | model = learner.train(gfeatures, glabels) 33 | assert ([model.apply(f) for f in gfeatures] == np.array(glabels)).mean() > .8 34 | 35 | 36 | def test_filter_outliers(): 37 | np.random.seed(22) 38 | features = [np.random.randn(10,10) for i in range(20)] 39 | for f in features: 40 | f[0] *= 10 41 | 42 | trainer = milk.supervised.grouped.filter_outliers(.9) 43 | model = 
trainer.train(features, [0] * len(features)) 44 | for f in features: 45 | ff = model.apply(f) 46 | assert np.all(ff == f[1:]) 47 | 48 | 49 | 50 | def test_nfoldcrossvalidation(): 51 | np.random.seed(22) 52 | features = np.array([np.random.rand(8+(i%3), 12)*(i//20) for i in range(40)], dtype=object) 53 | labels = np.zeros(40, int) 54 | labels[20:] = 1 55 | classifier = milk.supervised.grouped.voting_classifier(milk.supervised.svm_simple(C=1., kernel=rbf_kernel(1./12))) 56 | cmat, names = milk.nfoldcrossvalidation(features, labels, classifier=classifier) 57 | assert cmat.shape == (2,2) 58 | assert sorted(names) == list(range(2)) 59 | 60 | 61 | 62 | class identity_classifier(object): 63 | def train(self, features, labels): 64 | return identity_model() 65 | 66 | class identity_model(object): 67 | def apply(self, f): 68 | return f 69 | 70 | 71 | def test_meanclassif(): 72 | gfeatures = [np.arange(10), np.arange(10)%2] 73 | glabels = [0,1] 74 | meanclassif = milk.supervised.grouped.mean_classifier(identity_classifier()) 75 | model = meanclassif.train(gfeatures, glabels) 76 | assert model.apply(gfeatures[0]) == np.arange(10).mean() 77 | assert model.apply(gfeatures[1]) == .5 78 | 79 | -------------------------------------------------------------------------------- /milk/tests/test_kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.unsupervised 3 | from milk.unsupervised.kmeans import assign_centroids, repeated_kmeans 4 | 5 | def test_kmeans(): 6 | np.random.seed(132) 7 | features = np.r_[np.random.rand(20,3)-.5,.5+np.random.rand(20,3)] 8 | def test_distance(dist, kwargs={}): 9 | assignments, _ = milk.unsupervised.kmeans(features, 2, distance=dist, **kwargs) 10 | positions = [0]*20 + [1]*20 11 | correct = (assignments == positions).sum() 12 | assert correct >= 38 or correct <= 2 13 | yield test_distance, 'euclidean' 14 | yield test_distance, 'seuclidean' 15 | yield test_distance, 'mahalanobis', { 'icov' : np.eye(3) } 16 | 17 | def test_kmeans_centroids(): 18 | np.random.seed(132) 19 | features = np.random.rand(201,30) 20 | for k in [2,3,5,10]: 21 | indices,centroids = milk.unsupervised.kmeans(features, k) 22 | for i in range(k): 23 | if np.any(indices == i): 24 | assert np.allclose(centroids[i], features[indices == i].mean(0)) 25 | 26 | 27 | def test_assign_cids(): 28 | from milksets.wine import load 29 | features,_ = load() 30 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 31 | assert np.all(assign_centroids(features, centroids) == assigns) 32 | 33 | def test_non_contiguous_fmatrix(): 34 | from milksets.wine import load 35 | features,_ = load() 36 | features = features[:,::2] 37 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 38 | assert np.all(assign_centroids(features, centroids) == assigns) 39 | 40 | features = features.astype(np.int32) 41 | assigns, centroids = milk.unsupervised.kmeans(features, 3, R=2, max_iters=10) 42 | assert np.all(assign_centroids(features, centroids) == assigns) 43 | 44 | 45 | def test_repeated_kmeans(): 46 | np.random.seed(132) 47 | features = np.random.rand(201,30) 48 | cids,cs = repeated_kmeans(features, 3, 2) 49 | assert len(cids) == len(features) 50 | 51 | def test_kmeans_return_partial(): 52 | np.random.seed(132) 53 | features = np.r_[np.random.rand(20,3)-.5,.5+np.random.rand(20,3)] 54 | assignments,centroids = milk.unsupervised.kmeans(features, 2, R=129) 55 | centroids_ = milk.unsupervised.kmeans(features, 2, R=129, 
return_assignments=False) 56 | assignments_ = milk.unsupervised.kmeans(features, 2, R=129, return_centroids=False) 57 | assert np.all(centroids == centroids_) 58 | assert np.all(assignments == assignments_) 59 | 60 | 61 | 62 | def test_kmeans_all_equal(): 63 | import milk.unsupervised._kmeans 64 | np.random.seed(132) 65 | for _ in range(8): 66 | a = (np.random.random(1024*128)*250).astype(int) 67 | b = a.copy() 68 | assert milk.unsupervised._kmeans.are_equal(a,b) 69 | a[3435] += 1 70 | assert not milk.unsupervised._kmeans.are_equal(a,b) 71 | -------------------------------------------------------------------------------- /milk/tests/test_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import milk.supervised.knn 3 | 4 | def test_simple(): 5 | X=np.array([ 6 | [0,0,0], 7 | [1,1,1], 8 | ]) 9 | Y=np.array([ 1, -1 ]) 10 | kNN = milk.supervised.knn.kNN(1) 11 | kNN = kNN.train(X,Y) 12 | assert kNN.apply(X[0]) == Y[0] 13 | assert kNN.apply(X[1]) == Y[1] 14 | assert kNN.apply([0,0,1]) == Y[0] 15 | assert kNN.apply([0,1,1]) == Y[1] 16 | 17 | def test_nnclassifier(): 18 | labels=[0,1] 19 | data=[[0.,0.],[1.,1.]] 20 | C = milk.supervised.knn.kNN(1) 21 | model = C.train(data,labels) 22 | assert model.apply(data[0]) == 0 23 | assert model.apply(data[1]) == 1 24 | assert model.apply([.01,.01]) == 0 25 | assert model.apply([.99,.99]) == 1 26 | assert model.apply([100,100]) == 1 27 | assert model.apply([-100,-100]) == 0 28 | assert model.apply([.9,.9]) == 1 29 | middle = model.apply([.5,.5]) 30 | assert (middle == 0) or (middle == 1) 31 | 32 | def test_approx_nnclassifier(): 33 | import milksets.wine 34 | features,labels = milksets.wine.load() 35 | for k in (1,3,5): 36 | learner = milk.supervised.knn.approximate_knn_learner(k) 37 | model = learner.train(features[::2], labels[::2]) 38 | testing = model.apply_many(features[1::2]) 39 | assert np.mean(testing == labels[1::2]) > .5 40 | -------------------------------------------------------------------------------- /milk/tests/test_lasso.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.lasso import lasso_learner 2 | import milk.supervised.lasso 3 | import numpy as np 4 | 5 | def test_lasso_smoke(): 6 | np.random.seed(3) 7 | for i in range(8): 8 | X = np.random.rand(100,10) 9 | Y = np.random.rand(5,10) 10 | B = np.random.rand(5,100) 11 | before = np.linalg.norm(Y - np.dot(B,X)) 12 | B = milk.supervised.lasso(X,Y) 13 | after = np.linalg.norm(Y - np.dot(B,X)) 14 | assert after < before 15 | assert np.all(~np.isnan(B)) 16 | 17 | def test_lasso_nans(): 18 | np.random.seed(3) 19 | for i in range(8): 20 | X = np.random.rand(100,10) 21 | Y = np.random.rand(5,10) 22 | B = np.random.rand(5,100) 23 | for j in range(12): 24 | Y.flat[np.random.randint(0,Y.size-1)] = float('nan') 25 | B = milk.supervised.lasso(X,Y) 26 | assert np.all(~np.isnan(B)) 27 | 28 | def test_lam_zero(): 29 | np.random.seed(2) 30 | for i in range(8): 31 | X = np.random.rand(24,2) 32 | Y = np.random.rand(1,2) 33 | B = milk.supervised.lasso(X,Y, lam=0.0) 34 | R = Y - np.dot(B,X) 35 | R = R.ravel() 36 | assert np.dot(R,R) < .01 37 | 38 | 39 | def test_lasso_walk(): 40 | np.random.seed(5) 41 | for i in range(4): 42 | X = np.random.rand(100,10) 43 | Y = np.random.rand(5,10) 44 | Bs = milk.supervised.lasso_walk(X,Y, start=.0001, nr_steps=3) 45 | B0 = milk.supervised.lasso(X,Y, lam=.0001) 46 | assert np.all(Bs[0] == B0) 47 | assert not np.all(Bs[0] == Bs[-1]) 48 | assert len(Bs) 
== 3 49 | 50 | def test_lasso_walk_nans(): 51 | np.random.seed(5) 52 | for i in range(3): 53 | X = np.random.rand(100,10) 54 | Y = np.random.rand(5,10) 55 | B = np.random.rand(5,100) 56 | for j in range(12): 57 | Y.flat[np.random.randint(0,Y.size-1)] = float('nan') 58 | B = milk.supervised.lasso_walk(X,Y, nr_steps=6) 59 | assert np.all(~np.isnan(B)) 60 | 61 | 62 | def test_learner(): 63 | np.random.seed(334) 64 | learner = lasso_learner() 65 | X = np.random.rand(100,10) 66 | Y = np.random.rand(5,10) 67 | model = learner.train(X,Y) 68 | test = model.apply(np.random.rand(100)) 69 | assert len(test) == len(Y) 70 | -------------------------------------------------------------------------------- /milk/tests/test_logistic.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.logistic 2 | import milksets.wine 3 | import numpy as np 4 | def test_better_than_random(): 5 | learner = milk.supervised.logistic.logistic_learner() 6 | features, labels = milksets.wine.load() 7 | model = learner.train(features, labels == 0) 8 | error = np.array([np.abs(model.apply(f)-(l == 0)) 9 | for f,l in zip(features, labels)]) 10 | assert error.mean() < .1 11 | -------------------------------------------------------------------------------- /milk/tests/test_measures.py: -------------------------------------------------------------------------------- 1 | import milk.measures.measures 2 | import milk.measures.curves 3 | import numpy as np 4 | import numpy 5 | from milk.measures import accuracy, waccuracy, bayesian_significance 6 | 7 | def test_100(): 8 | C=numpy.zeros((2,2)) 9 | C[0,0]=100 10 | C[1,1]=50 11 | assert accuracy(C) == 1. 12 | assert waccuracy(C) == 1. 13 | 14 | def test_0(): 15 | C = numpy.array([ 16 | [0, 10], 17 | [10, 0] 18 | ]) 19 | assert waccuracy(C) == 0. 20 | assert accuracy(C) == 0. 21 | 22 | def test_50(): 23 | C = numpy.array([ 24 | [10, 10], 25 | [10, 10] 26 | ]) 27 | assert accuracy(C) == .5 28 | assert waccuracy(C) == .5 29 | 30 | def test_unbalanced(): 31 | C = numpy.array([ 32 | [20, 10], 33 | [10, 0] 34 | ]) 35 | assert accuracy(C) == .5 36 | assert waccuracy(C) == 1./3 37 | 38 | 39 | 40 | def test_confusion_matrix(): 41 | np.random.seed(323) 42 | labels0 = np.arange(101)%3 43 | labels1 = (labels0 + np.random.rand(101)*2).astype(np.int) % 3 44 | cmat = milk.measures.measures.confusion_matrix(labels0, labels1) 45 | for i in range(3): 46 | for j in range(3): 47 | assert cmat[i,j] == np.sum( (labels0 == i) & (labels1 == j) ) 48 | 49 | 50 | 51 | def test_significance(): 52 | assert np.allclose(.5, [bayesian_significance(1024,i,i) for i in range(0, 1025, 3)]) 53 | 54 | 55 | def test_roc(): 56 | np.random.seed(3) 57 | for i in range(4): 58 | labels = np.repeat([False,True], 50) 59 | response = labels + np.random.random(100)*i 60 | P,R = milk.measures.curves.roc(response, labels != 0) 61 | assert P.min() >= 0. 62 | assert R.min() >= 0. 63 | assert P.max() <= 1. 64 | assert R.max() <= 1. 65 | -------------------------------------------------------------------------------- /milk/tests/test_measures_clusters.py: -------------------------------------------------------------------------------- 1 | import milk.measures.cluster_agreement 2 | import numpy as np 3 | def test_rand_arand_jaccard(): 4 | np.random.seed(33) 5 | 6 | labels = np.repeat(np.arange(4),10) 7 | clusters = np.repeat(np.arange(4),10) 8 | 9 | a0,b0,c0= milk.measures.cluster_agreement.rand_arand_jaccard(clusters, labels) 10 | assert a0 == 1. 11 | assert b0 == 1. 
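# Identical partitions must score exactly 1 under both the Rand and adjusted
# Rand indices; shuffling the cluster labels below should push agreement
# strictly below 1 and make the Jaccard score drop below the perfect-match c0.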
12 | 13 | np.random.shuffle(clusters) 14 | a1,b1,c1= milk.measures.cluster_agreement.rand_arand_jaccard(clusters, labels) 15 | assert a1 >= 0. 16 | assert a1 < 1. 17 | assert b1 < 1. 18 | assert b1 >= 0. 19 | assert c1 < c0 20 | 21 | -------------------------------------------------------------------------------- /milk/tests/test_multi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import milk.supervised.svm 4 | import milk.supervised.multi 5 | from milk.supervised.classifier import ctransforms 6 | from .fast_classifier import fast_classifier 7 | 8 | import milksets.wine 9 | features,labels = milksets.wine.load() 10 | A = np.arange(len(features)) 11 | random.seed(9876543210) 12 | random.shuffle(A) 13 | features = features[A] 14 | labels = labels[A] 15 | labelset = set(labels) 16 | base = ctransforms(milk.supervised.svm.svm_raw(C=2.,kernel=milk.supervised.svm.rbf_kernel(2.**-3)),milk.supervised.svm.svm_binary()) 17 | 18 | def test_one_against_rest(): 19 | M = milk.supervised.multi.one_against_rest(base) 20 | M = M.train(features[:100,:],labels[:100]) 21 | tlabels = [M.apply(f) for f in features[100:]] 22 | for tl in tlabels: 23 | assert tl in labelset 24 | 25 | def test_one_against_one(): 26 | M = milk.supervised.multi.one_against_one(base) 27 | M = M.train(features[:100,:],labels[:100]) 28 | tlabels = [M.apply(f) for f in features[100:]] 29 | for tl in tlabels: 30 | assert tl in labelset 31 | tlabels_many = M.apply_many(features[100:]) 32 | assert np.all(tlabels == tlabels_many) 33 | 34 | def test_two_thirds(): 35 | np.random.seed(2345) 36 | C = milk.supervised.defaultclassifier('fast') 37 | X = np.random.rand(120,4) 38 | X[:40] += np.random.rand(40,4) 39 | X[:40] += np.random.rand(40,4) 40 | X[40:80] -= np.random.rand(40,4) 41 | X[40:80] -= np.random.rand(40,4) 42 | Y = np.repeat(np.arange(3), 40) 43 | model = C.train(X,Y) 44 | Y_ = np.array([model.apply(x) for x in X]) 45 | assert (Y_ == Y).mean() * 3 > 2 46 | 47 | def test_multi_labels(): 48 | clabels = [[lab, lab+7] for lab in labels] 49 | multi_label = milk.supervised.multi.one_against_rest_multi(base) 50 | model = multi_label.train(features[::2], clabels[::2]) 51 | test_vals = [model.apply(f) for f in features[1::2]] 52 | for ts in test_vals: 53 | if 0.0 in ts: assert 7.0 in ts 54 | if 1.0 in ts: assert 8.0 in ts 55 | if 2.0 in ts: assert 9.0 in ts 56 | 57 | 58 | def test_classifier_no_set_options(): 59 | # Basically these should not raise an exception 60 | milk.supervised.multi.one_against_rest_multi(fast_classifier()) 61 | milk.supervised.multi.one_against_rest(fast_classifier()) 62 | milk.supervised.multi.one_against_one(fast_classifier()) 63 | 64 | 65 | def test_tree(): 66 | mtree = milk.supervised.multi.multi_tree_learner(fast_classifier()) 67 | labels = [0,1,2,2,3,3,3,3] 68 | features = np.random.random_sample((len(labels), 8)) 69 | model = mtree.train(features, labels) 70 | counts = np.zeros(4) 71 | for ell in labels: 72 | counts[ell] += 1 73 | 74 | g0,g1 = milk.supervised.multi.split(counts) 75 | assert np.all(g0 == [3]) or np.all(g1 == [3]) 76 | def list_to_zero(v): 77 | if isinstance(v, list): 78 | return 1000 79 | return v 80 | def r(m): 81 | if len(m) == 1: return int(m[0]) 82 | else: return sorted([r(m[1]), r(m[2])], key=list_to_zero) 83 | assert r(model.model) == [3,[2,[0,1]]] 84 | 85 | -------------------------------------------------------------------------------- /milk/tests/test_multi_label.py: 
-------------------------------------------------------------------------------- 1 | from milk.tests.fast_classifier import fast_classifier 2 | import milk.supervised.multi_label 3 | import milk 4 | import numpy as np 5 | 6 | def test_one_by_one(): 7 | np.random.seed(23) 8 | r = np.random.random 9 | ps = np.array([.7,.5,.8,.3,.8]) 10 | learner = milk.supervised.multi_label.one_by_one(fast_classifier()) 11 | universe = list(range(len(ps))) 12 | 13 | for _ in range(10): 14 | labels = [] 15 | features = [] 16 | bases = [np.random.rand(20) for pj in ps] 17 | for i in range(256): 18 | cur = [] 19 | curf = np.zeros(20,float) 20 | for j,pj in enumerate(ps): 21 | if r() < pj: 22 | cur.append(j) 23 | curf += r()*bases[j] 24 | if not cur: continue 25 | labels.append(cur) 26 | features.append(curf) 27 | 28 | model = learner.train(features, labels) 29 | predicted = model.apply_many(features) 30 | matrix = np.zeros((2,2), int) 31 | for t,p in zip(labels, predicted): 32 | for ell in universe: 33 | row = (ell in t) 34 | col = (ell in p) 35 | matrix[row,col] += 1 36 | Tn,Fp = matrix[0] 37 | Fn,Tp = matrix[1] 38 | prec = Tp/float(Tp+Fp) 39 | recall = Tp/float(Tp+Fn) 40 | F1 = 2*prec*recall/(prec + recall) 41 | assert F1 > .3 42 | -------------------------------------------------------------------------------- /milk/tests/test_multi_view.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.multi_view 2 | import numpy as np 3 | import milk.supervised.svm 4 | from milk.supervised.defaultclassifier import feature_selection_simple 5 | 6 | def test_multi_view(): 7 | from milksets.wine import load 8 | features, labels = load() 9 | features0 = features[::10] 10 | features1 = features[1::10] 11 | features2 = features[2::10] 12 | labels0 = labels[::10] 13 | labels1 = labels[1::10] 14 | labels2 = labels[2::10] 15 | 16 | assert np.all(labels0 == labels1) 17 | assert np.all(labels1 == labels2) 18 | labels = labels0 19 | train_features = list(zip(features0,features1,features2)) 20 | test_features = list(zip(features[3::10], features[4::10], features[5::10])) 21 | base = milk.supervised.classifier.ctransforms( 22 | feature_selection_simple(), 23 | milk.supervised.svm.svm_raw(C=128, kernel=milk.supervised.svm.rbf_kernel(4.)), 24 | milk.supervised.svm.svm_sigmoidal_correction() 25 | ) 26 | classifier = milk.supervised.multi_view.multi_view_classifier([base,base,base]) 27 | model = classifier.train(train_features, labels == 0) 28 | assert ([model.apply(f) for f in test_features] == (labels == 0)).mean() > .9 29 | -------------------------------------------------------------------------------- /milk/tests/test_nnmf.py: -------------------------------------------------------------------------------- 1 | import milk.unsupervised 2 | import numpy as np 3 | def test_nnmf(): 4 | def test3(method): 5 | np.random.seed(8) 6 | X3 = np.random.rand(20,3) 7 | X = np.c_[ X3, 8 | X3*2+np.random.rand(20,3)/20., 9 | -X3*2+np.random.rand(20,3)/10.] 10 | W,V = method(X, 3, R=7) 11 | assert np.sum((np.dot(W,V)-X)**2)/np.sum(X**2) < .5 12 | 13 | yield test3, milk.unsupervised.lee_seung 14 | yield test3, milk.unsupervised.sparse_nnmf 15 | 16 | def test_sparse_nnmf(): 17 | # This is really just a smoke test because the test case is not sparse!! 18 | from milk.unsupervised import sparse_nnmf 19 | np.random.seed(8) 20 | X3 = np.random.rand(20,3) 21 | X = np.c_[ X3, 22 | X3*2+np.random.rand(20,3)/20., 23 | -X3*2+np.random.rand(20,3)/10.] 
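# X is (20, 9): three random base columns plus two noisy linear images of
# them, so it is approximately rank 3; a 3-component factorisation should
# therefore leave a residual with less variance than X itself (checked below).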
24 | W,V = sparse_nnmf(X, 3, sparsenessW=.7, sparsenessH=.7, R=7) 25 | assert not np.any(np.isnan(W)) 26 | assert not np.any(np.isnan(V)) 27 | error = np.dot(W,V)-X 28 | assert error.var() < X.var() 29 | 30 | 31 | 32 | def test_hoyer_project(): 33 | from milk.unsupervised.nnmf.hoyer import _L1for, _project 34 | def sp(n, L1, L2): 35 | return (np.sqrt(n) - L1/L2)/(np.sqrt(n) - 1) 36 | sparseness = .6 37 | n = 9. 38 | row = np.arange(int(n))/n 39 | L2 = np.sqrt(np.dot(row, row)) 40 | L1 = _L1for(sparseness, row, L2) 41 | 42 | assert np.abs(sp(n, L1, L2) - sparseness) < 1.e-4 43 | row_ = _project(row, L1, L2) 44 | assert not np.any(np.isnan(row_)) 45 | assert np.all(row_ >= 0) 46 | 47 | L2 = np.sqrt(np.dot(row, row)) 48 | L1 = np.sum(np.abs(row_)) 49 | res = sp(n, L1, L2) 50 | assert np.abs(res - sparseness) < 1.e-4 51 | 52 | -------------------------------------------------------------------------------- /milk/tests/test_normalise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
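# The tests below cover milk.supervised.normalise (zscore_normalise,
# interval_normalise, and the sample_to_2min class-balancing helper) plus
# the _nanstd helper from milk.unsupervised.normalise.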
22 | 23 | from __future__ import division 24 | import numpy 25 | import numpy as np 26 | from milk.supervised.normalise import sample_to_2min 27 | import milk.supervised.normalise 28 | 29 | 30 | def test_zscore_normalise(): 31 | I=milk.supervised.normalise.zscore_normalise() 32 | numpy.random.seed(1234) 33 | features = numpy.random.rand(20,100) 34 | L = numpy.zeros(100) 35 | model = I.train(features, L) 36 | transformed = np.array([model.apply(f) for f in features]) 37 | assert np.all( transformed.mean(0)**2 < 1e-7 ) 38 | assert np.all( np.abs(transformed.std(0) - 1) < 1e-3 ) 39 | 40 | 41 | def test_sample_to_2min(): 42 | A = np.zeros(256, np.int32) 43 | def test_one(A): 44 | selected = sample_to_2min(A) 45 | ratios = [] 46 | for l0 in set(A): 47 | for l1 in set(A): 48 | ratios.append( (A[selected] == l0).sum() / (A[selected] == l1).sum() ) 49 | assert np.max(ratios) <= 2.001 50 | A[20:] = 1 51 | yield test_one, A 52 | 53 | A[21:] = 1 54 | yield test_one, A 55 | 56 | A[129:] = 2 57 | yield test_one, A 58 | 59 | def test_sample_to_2min_list(): 60 | from collections import defaultdict 61 | def count(xs): 62 | counts = defaultdict(int) 63 | for x in xs: 64 | counts[x] += 1 65 | return counts 66 | labels = ["A"]*8 + ["B"]*12 + ["C"]*16 + ["D"] * 24 + ["E"] * 1000 67 | selected = sample_to_2min(labels) 68 | before = count(labels) 69 | after = count(np.array(labels)[selected]) 70 | assert max(after.values()) == min(before.values())*2 71 | 72 | 73 | def test_interval_normalise(): 74 | interval = milk.supervised.normalise.interval_normalise() 75 | np.random.seed(105) 76 | features = np.random.randn(100, 5) 77 | model = interval.train(features, features[0] > 0) 78 | transformed = np.array([model.apply(f) for f in features]) 79 | assert np.allclose(transformed.min(0), -1) 80 | assert np.allclose(transformed.max(0), +1) 81 | 82 | 83 | 84 | def test_nanstd(): 85 | from milk.unsupervised.normalise import _nanstd 86 | np.random.seed(234) 87 | for i in range(8): 88 | x = np.random.rand(200,231) 89 | assert np.allclose(_nanstd(x,0), x.std(0)) 90 | assert np.allclose(_nanstd(x,1), x.std(1)) 91 | -------------------------------------------------------------------------------- /milk/tests/test_normaliselabels.py: -------------------------------------------------------------------------------- 1 | from milk.supervised.normalise import normaliselabels 2 | import numpy as np 3 | 4 | def test_normaliselabels(): 5 | np.random.seed(22) 6 | labels = np.zeros(120, np.uint8) 7 | labels[40:] += 1 8 | labels[65:] += 1 9 | reorder = np.argsort(np.random.rand(len(labels))) 10 | labels = labels[reorder] 11 | labels2,names = normaliselabels(labels) 12 | for new_n,old_n in enumerate(names): 13 | assert np.all( (labels == old_n) == (labels2 == new_n) ) 14 | 15 | def test_normaliselabels_multi(): 16 | np.random.seed(30) 17 | r = np.random.random 18 | for v in range(10): 19 | labels = [] 20 | p = np.array([.24,.5,.1,.44]) 21 | for i in range(100): 22 | cur = [j for j in range(4) if r() < p[j]] 23 | if not cur: cur = [0] 24 | labels.append(cur) 25 | nlabels, names = normaliselabels(labels, True) 26 | assert len(labels) == len(nlabels) 27 | assert len(nlabels[0]) == max(list(map(max,labels)))+1 28 | assert nlabels.sum() == sum(map(len,labels)) 29 | 30 | -------------------------------------------------------------------------------- /milk/tests/test_parzen.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import milk.supervised.normalise 3 | from milk.supervised.parzen 
import get_parzen_rbf_loocv 4 | import numpy as np 5 | import milksets 6 | 7 | def _slow_parzen(features, labels, sigma): 8 | correct = 0 9 | N = len(features) 10 | labels = 2*labels - 1 11 | def kernel(fi, fj): 12 | return np.exp(-((fi-fj)**2).sum()/sigma) 13 | for i in range(N): 14 | C = 0. 15 | for j in range(N): 16 | if i == j: continue 17 | C += labels[j] * kernel(features[i],features[j]) 18 | if (C*labels[i] > 0): correct += 1 19 | return correct/N 20 | 21 | def test_parzen(): 22 | features,labels = milksets.wine.load() 23 | labels = (labels == 1) 24 | features = milk.supervised.normalise.zscore(features) 25 | f = get_parzen_rbf_loocv(features, labels) 26 | sigmas = 2.**np.arange(-4,4) 27 | for s in sigmas: 28 | assert abs(_slow_parzen(features, labels, s) - f(s)) < 1e-6 29 | -------------------------------------------------------------------------------- /milk/tests/test_pca.py: -------------------------------------------------------------------------------- 1 | import numpy.random 2 | import milk.unsupervised.pca 3 | import numpy as np 4 | 5 | def test_pca(): 6 | numpy.random.seed(123) 7 | X = numpy.random.rand(10,4) 8 | X[:,1] += numpy.random.rand(10)**2*X[:,0] 9 | X[:,1] += numpy.random.rand(10)**2*X[:,0] 10 | X[:,2] += numpy.random.rand(10)**2*X[:,0] 11 | Y,V = milk.unsupervised.pca(X) 12 | Xn = milk.unsupervised.normalise.zscore(X) 13 | assert X.shape == Y.shape 14 | assert ((np.dot(V[:4].T,Y[:,:4].T).T-Xn)**2).sum()/(Xn**2).sum() < .3 15 | 16 | def test_mds(): 17 | from milk.unsupervised import pdist 18 | np.random.seed(232) 19 | for _ in range(12): 20 | features = np.random.random_sample((12,4)) 21 | X = milk.unsupervised.mds(features,4) 22 | D = pdist(features) 23 | D2 = pdist(X) 24 | assert np.mean( (D - D2) ** 2) < 10e-4 25 | 26 | 27 | def test_mds_dists(): 28 | from milk.unsupervised import pdist 29 | np.random.seed(232) 30 | for _ in range(12): 31 | features = np.random.random_sample((12,4)) 32 | D = pdist(features) 33 | X = milk.unsupervised.mds(features,4) 34 | X2 = milk.unsupervised.mds_dists(D, 4) 35 | assert np.mean( (X - X2) ** 2) < 10e-4 36 | 37 | 38 | 39 | def test_mds_list(): 40 | from milk.unsupervised.pca import mds 41 | data = np.random.random((128,16)) 42 | V = mds(data,2) 43 | V2 = mds(list(data),2) 44 | assert np.all(V == V2) 45 | 46 | def test_mds_regression_eig_order(): 47 | from milk.unsupervised.pca import mds_dists 48 | # This was part of a much larger computation, but this isolated the bug: 49 | dists = np.array([[ 50 | 0. , 377241.01101501, 390390.47006156, 51 | 340764.02535826, 421258.30020762, 470960.15365819, 52 | 331864.64507197, 213029.60122458, 306976.87583849], 53 | [ 377241.01101501, 0. , 159390.25449606, 54 | 140506.60640227, 140922.67044651, 221684.10621381, 55 | 130161.14561428, 224134.4629224 , 225617.6525412 ], 56 | [ 390390.47006156, 159390.25449606, 0. , 57 | 188417.11617804, 192114.58972062, 238026.3963446 , 58 | 159070.76483779, 242792.81436928, 228843.70200362], 59 | [ 340764.02535826, 140506.60640227, 188417.11617804, 60 | 0. , 247098.49216397, 265783.27794352, 61 | 161672.29500768, 170503.64299615, 171360.11464776], 62 | [ 421258.30020762, 140922.67044651, 192114.58972062, 63 | 247098.49216397, 0. , 246385.36543382, 64 | 153380.00248566, 276707.33890808, 276009.04198403], 65 | [ 470960.15365819, 221684.10621381, 238026.3963446 , 66 | 265783.27794352, 246385.36543382, 0. 
, 67 | 252609.80940353, 327987.54137854, 308492.70255307], 68 | [ 331864.64507197, 130161.14561428, 159070.76483779, 69 | 161672.29500768, 153380.00248566, 252609.80940353, 70 | 0. , 179275.66833105, 192598.94271197], 71 | [ 213029.60122458, 224134.4629224 , 242792.81436928, 72 | 170503.64299615, 276707.33890808, 327987.54137854, 73 | 179275.66833105, 0. , 117004.41340669], 74 | [ 306976.87583849, 225617.6525412 , 228843.70200362, 75 | 171360.11464776, 276009.04198403, 308492.70255307, 76 | 192598.94271197, 117004.41340669, 0. ]]) 77 | V = milk.unsupervised.mds_dists(dists, 2) 78 | assert V[:,1].ptp() > 1. 79 | -------------------------------------------------------------------------------- /milk/tests/test_pdist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.unsupervised import pdist, plike 3 | 4 | def test_pdist(): 5 | np.random.seed(222) 6 | X = np.random.randn(100,23) 7 | Y = np.random.randn(80,23) 8 | Dxx = pdist(X) 9 | for i in range(X.shape[0]): 10 | for j in range(X.shape[0]): 11 | assert np.allclose(Dxx[i,j], np.sum((X[i]-X[j])**2)) 12 | 13 | Dxy = pdist(X,Y) 14 | for i in range(X.shape[0]): 15 | for j in range(Y.shape[0]): 16 | assert np.allclose(Dxy[i,j], np.sum((X[i]-Y[j])**2)) 17 | Dxye = pdist(X, Y, 'euclidean') 18 | assert np.allclose(Dxye, np.sqrt(Dxy)) 19 | 20 | def test_plike(): 21 | np.random.seed(222) 22 | X = np.random.randn(100,23) 23 | Lxx = plike(X) 24 | assert len(Lxx) == len(Lxx.T) 25 | Lxx2 = plike(X, sigma2=.001) 26 | assert Lxx[0,1] != Lxx2[0,1] 27 | assert Lxx[0,0] == Lxx2[0,0] 28 | -------------------------------------------------------------------------------- /milk/tests/test_perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.perceptron import perceptron_learner 3 | from milk.supervised import _perceptron 4 | from milksets.yeast import load 5 | 6 | def test_raw(): 7 | np.random.seed(23) 8 | data = np.random.random((100,10)) 9 | data[50:] += .5 10 | labels = np.repeat((0,1), 50) 11 | weights = np.zeros((11)) 12 | eta = 0.1 13 | for i in range(20): 14 | _perceptron.perceptron(data, labels, weights, eta) 15 | errs = _perceptron.perceptron(data, labels, weights, eta) 16 | assert errs < 10 17 | 18 | def test_wrapper(): 19 | features,labels = load() 20 | labels = (labels >= 5) 21 | 22 | learner = perceptron_learner() 23 | model = learner.train(features, labels) 24 | test = list(map(model.apply, features)) 25 | assert np.mean(labels != test) < .35 26 | -------------------------------------------------------------------------------- /milk/tests/test_precluster_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised.precluster import precluster_learner, select_precluster 3 | from milk.tests.fast_classifier import fast_classifier 4 | 5 | def c0(): 6 | return np.random.rand(8) 7 | def c1(): 8 | return c0()+2.*np.ones(8) 9 | 10 | def gen_data(seed, with_nums=False): 11 | np.random.seed(seed) 12 | 13 | features = [] 14 | labels =[] 15 | for i in range(200): 16 | f = [] 17 | for j in range(40): 18 | use_0 = (i < 100 and j < 30) or (i >= 100 and j >= 30) 19 | if use_0: f.append(c0()) 20 | else: f.append(c1()) 21 | labels.append((i < 100)) 22 | if with_nums: 23 | features.append((f,[])) 24 | else: 25 | features.append(f) 26 | return features, labels 27 | 28 | 29 | def test_precluster(): 30 | learner = precluster_learner([2], 
base=fast_classifier(), R=12) 31 | features, labels = gen_data(22) 32 | model = learner.train(features,labels) 33 | 34 | assert model.apply([c0() for i in range(35)]) 35 | assert not model.apply([c1() for i in range(35)]) 36 | 37 | def test_codebook_learner(): 38 | learner = select_precluster([2,3,4], base=fast_classifier()) 39 | learner.rmax = 3 40 | features, labels = gen_data(23, 1) 41 | model = learner.train(features,labels) 42 | 43 | assert model.apply(([c0() for i in range(35)],[])) 44 | assert not model.apply(([c1() for i in range(35)],[])) 45 | 46 | def test_codebook_learner_case1(): 47 | learner = select_precluster([2], base=fast_classifier()) 48 | learner.rmax = 1 49 | features, labels = gen_data(23, 1) 50 | model = learner.train(features,labels) 51 | 52 | assert model.apply(([c0() for i in range(35)],[])) 53 | assert not model.apply(([c1() for i in range(35)],[])) 54 | 55 | -------------------------------------------------------------------------------- /milk/tests/test_regression.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import milk.supervised._svm 4 | from gzip import GzipFile 5 | from os import path 6 | from milksets.wine import load 7 | from milk.supervised import defaultclassifier 8 | import milk 9 | 10 | def test_svm_crash(): 11 | from sys import version_info 12 | if version_info.major >= 3: 13 | pickle_load = lambda f: pickle.load(f, encoding='latin1') 14 | else: 15 | pickle_load = pickle.load 16 | 17 | X,Y,kernel, C, eps ,tol, = pickle_load(GzipFile(path.dirname(__file__) + '/data/regression-2-Dec-2009.pp.gz')) 18 | X = X[2:-2,:].copy() 19 | Y = Y[2:-2].copy() 20 | N = len(Y) 21 | Y = Y.astype(np.int32) 22 | p = -np.ones(N,np.double) 23 | params = np.array([0,C,eps,tol],np.double) 24 | Alphas0 = np.zeros(N, np.double) 25 | cache_size = (1<<20) 26 | # The line below crashed milk: 27 | milk.supervised._svm.eval_LIBSVM(X,Y,Alphas0,p,params,kernel,cache_size) 28 | # HASN'T CRASHED! 
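# Regression guard: the gzipped pickle replays the exact (X, Y, kernel, C,
# eps, tol) inputs from a December 2009 crash report, so this test passes
# simply by reaching this point without a segfault.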
29 | 30 | 31 | def test_nov2010(): 32 | # Bug submitted by Mao Ziyang 33 | # This was failing in 0.3.5 because SDA selected no features 34 | np.random.seed(222) 35 | features = np.random.randn(100,20) 36 | features[:50] *= 2 37 | labels = np.repeat((0,1), 50) 38 | 39 | classifier = milk.defaultclassifier() 40 | model = classifier.train(features, labels) 41 | new_label = model.apply(np.random.randn(20)*2) 42 | new_label2 = model.apply(np.random.randn(20)) 43 | assert new_label == 0 44 | assert new_label2 == 1 45 | 46 | def test_default_small(): 47 | features, labels = load() 48 | selected = np.concatenate( [np.where(labels < 2)[0], np.where(labels == 2)[0][:6]] ) 49 | features = features[selected] 50 | labels = labels[selected] 51 | learner = defaultclassifier('fast') 52 | # For version 0.3.8, the line below led to an error 53 | milk.nfoldcrossvalidation(features, labels, classifier=learner) 54 | 55 | -------------------------------------------------------------------------------- /milk/tests/test_regression_constant_features.py: -------------------------------------------------------------------------------- 1 | import milk 2 | import numpy as np 3 | def test_constant_features(): 4 | learner = milk.defaultclassifier() 5 | features = np.ones(20).reshape((-1,1)) 6 | labels = np.zeros(20) 7 | labels[10:] += 1 8 | features[10:] *= -1 9 | learner.train(features, labels) 10 | 11 | -------------------------------------------------------------------------------- /milk/tests/test_rf.py: -------------------------------------------------------------------------------- 1 | from milk.supervised import randomforest 2 | import numpy as np 3 | 4 | def test_rf(): 5 | from milksets import wine 6 | features, labels = wine.load() 7 | features = features[labels < 2] 8 | labels = labels[labels < 2] 9 | learner = randomforest.rf_learner() 10 | model = learner.train(features[::5], labels[::5]) 11 | test = [model.apply(f) for f in features] 12 | assert np.mean(labels == test) > .7 13 | 14 | -------------------------------------------------------------------------------- /milk/tests/test_set2binary_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.supervised import set2binary_array 3 | 4 | def test_set2binary_array_len(): 5 | s2f = set2binary_array.set2binary_array() 6 | inputs = [ np.arange(1,3)*2, np.arange(4)**2, np.arange(6)+2 ] 7 | labels = [0,0,1] 8 | model = s2f.train(inputs,labels) 9 | assert len(model.apply(inputs[0])) == len(model.apply(inputs[1])) 10 | assert len(model.apply(inputs[0])) == len(model.apply(inputs[2])) 11 | assert len(model.apply(inputs[0])) == len(model.apply(list(range(128)))) 12 | 13 | -------------------------------------------------------------------------------- /milk/tests/test_som.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from milk.unsupervised import som 4 | from milk.unsupervised.som import putpoints, closest 5 | 6 | 7 | def _slow_putpoints(grid, points, L=.2): 8 | for point in points: 9 | dpoint = grid-point 10 | y,x = np.unravel_index(np.abs(dpoint).argmin(), dpoint.shape) 11 | for dy in range(-4, +4): 12 | for dx in range(-4, +4): 13 | ny = y + dy 14 | nx = x + dx 15 | if ny < 0 or ny >= grid.shape[0]: 16 | continue 17 | if nx < 0 or nx >= grid.shape[1]: 18 | continue 19 | L2 = L/(1+np.abs(dy)+np.abs(dx)) 20 | grid[ny,nx] *= 1. 
- L2 21 | grid[ny,nx] += point*L2 22 | 23 | 24 | def data_grid(): 25 | np.random.seed(22) 26 | data = np.arange(100000, dtype=np.float32) 27 | grid = np.array([data.flat[np.random.randint(0, data.size)] for i in range(64*64)]).reshape((64,64,1)) 28 | data = data.reshape((-1,1)) 29 | return grid, data 30 | 31 | def test_putpoints(): 32 | grid, points = data_grid() 33 | points = points[:100] 34 | grid2 = grid.copy() 35 | putpoints(grid, points, L=0., R=1) 36 | assert np.all(grid == grid2) 37 | putpoints(grid, points, L=.5, R=1) 38 | assert not np.all(grid == grid2) 39 | 40 | def test_against_slow(): 41 | grid, points = data_grid() 42 | grid2 = grid.copy() 43 | putpoints(grid, points[:10], shuffle=False) 44 | _slow_putpoints(grid2.reshape((64,64)), points[:10]) 45 | assert np.allclose(grid, grid2) 46 | 47 | 48 | def test_som(): 49 | N = 10000 50 | np.random.seed(2) 51 | data = np.array([np.arange(N), N/4.*np.random.randn(N)]) 52 | data = data.transpose().copy() 53 | grid = som(data, (8,8), iterations=3, R=4) 54 | assert grid.shape == (8,8,2) 55 | y,x = closest(grid, data[0]) 56 | assert 0 <= y < grid.shape[0] 57 | assert 0 <= x < grid.shape[1] 58 | 59 | grid2 = grid.copy() 60 | np.random.shuffle(grid2) 61 | full = np.abs(np.diff(grid2[:,:,0], axis=0)).mean() 62 | obs = np.abs(np.diff(grid[:,:,0], axis=0)).mean() 63 | obs2 = np.abs(np.diff(grid[:,:,0], axis=1)).mean() 64 | assert obs + 4*np.abs(obs-obs2) < full 65 | 66 | -------------------------------------------------------------------------------- /milk/tests/test_svm_sigmoidal.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from milk.supervised import svm 3 | import numpy 4 | import numpy as np 5 | 6 | def old_learn_sigmoid_constants(F,Y, 7 | max_iters=None, 8 | min_step=1e-10, 9 | sigma=1e-12, 10 | eps=1e-5): 11 | ''' 12 | Old version. Direct C-like implementation 13 | ''' 14 | # the deci[i] array is called F[i] in this code 15 | F = np.asanyarray(F) 16 | Y = np.asanyarray(Y) 17 | assert len(F) == len(Y) 18 | assert numpy.all( (Y == 1) | (Y == 0) ) 19 | from numpy import log, exp 20 | N=len(F) 21 | if max_iters is None: max_iters = 1000 22 | 23 | prior1 = Y.sum() 24 | prior0 = N-prior1 25 | 26 | small_nr = 1e-4 27 | 28 | hi_t = (prior1+1.)/(prior1+2.) 29 | lo_t = 1./(prior0+2.) 30 | 31 | T = Y*hi_t + (1-Y)*lo_t 32 | 33 | A = 0. 34 | B = log( (prior0+1.)/(prior1+1.) ) 35 | def target(A,B): 36 | fval = 0. 37 | for i in range(N): 38 | fApB = F[i]*A+B 39 | if fApB >= 0: 40 | fval += T[i]*fApB+log(1+exp(-fApB)) 41 | else: 42 | fval += (T[i]-1.)*fApB+log(1+exp(fApB)) 43 | return fval 44 | fval = target(A,B) 45 | for iter in range(max_iters): 46 | h11=sigma 47 | h22=sigma 48 | h21=0. 49 | g1=0. 50 | g2=0. 51 | for i in range(N): 52 | fApB = F[i]*A+B 53 | if (fApB >= 0): 54 | p = exp(-fApB)/(1.+exp(-fApB)) 55 | q = 1./(1.+exp(-fApB)) 56 | else: 57 | p = 1./(1.+exp(fApB)) 58 | q = exp(fApB)/(1.+exp(fApB)) 59 | d2 = p * q 60 | h11 += F[i]*F[i]*d2 61 | h22 += d2 62 | h21 += F[i]*d2 63 | d1 = T[i] - p 64 | g1 += F[i]*d1 65 | g2 += d1 66 | if abs(g1) < eps and abs(g2) < eps: # Stopping criteria 67 | break 68 | 69 | det = h11*h22 - h21*h21 70 | dA = - (h22*g1 - h21*g2)/det 71 | dB = - (h21*g1 + h11*g2)/det 72 | gd = g1*dA + g2*dB 73 | stepsize = 1. 
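# Backtracking line search: start from a full Newton step (stepsize = 1.)
# and halve it until the objective decreases enough (the Armijo-style test
# newf < fval + eps*stepsize*gd below), giving up once stepsize < min_step.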
74 | 75 | while stepsize >= min_step: 76 | newA = A + stepsize*dA 77 | newB = B + stepsize*dB 78 | newf = target(newA,newB) 79 | if newf < fval+eps*stepsize*gd: 80 | A = newA 81 | B = newB 82 | fval = newf 83 | break 84 | stepsize /= 2 85 | if stepsize < min_step: 86 | break 87 | return A,B 88 | 89 | 90 | def test_learn_sigmoid_contants(): 91 | Y = np.repeat((0,1),100) 92 | np.random.seed(3) 93 | for i in range(10): 94 | F = np.random.rand(200)-.3 95 | F[100:] *= -1 96 | old = old_learn_sigmoid_constants(F,Y) 97 | new = svm.learn_sigmoid_constants(F,Y) 98 | assert np.allclose(old, new) 99 | 100 | -------------------------------------------------------------------------------- /milk/tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import milk.supervised.tree 2 | import milk.supervised._tree 3 | from milk.supervised._tree import set_entropy 4 | from milk.supervised.tree import information_gain, stump_learner 5 | import numpy as np 6 | 7 | def test_tree(): 8 | from milksets import wine 9 | features, labels = wine.load() 10 | selected = (labels < 2) 11 | features = features[selected] 12 | labels = labels[selected] 13 | C = milk.supervised.tree.tree_classifier() 14 | model = C.train(features,labels) 15 | assert (np.array([model.apply(f) for f in features]) == labels).mean() > .5 16 | 17 | 18 | def test_split_subsample(): 19 | import random 20 | from milksets import wine 21 | features, labels = wine.load() 22 | labels = labels.astype(np.int) 23 | 24 | seen = set() 25 | for i in range(20): 26 | random.seed(2) 27 | i,s = milk.supervised.tree._split(features[::10], labels[::10], None, milk.supervised.tree.information_gain, 2, random) 28 | seen.add(i) 29 | assert len(seen) <= 2 30 | 31 | 32 | def test_set_entropy(): 33 | labels = np.arange(101)%3 34 | counts = np.zeros(3) 35 | entropy = milk.supervised._tree.set_entropy(labels, counts) 36 | slow_counts = np.array([(labels == i).sum() for i in range(3)]) 37 | assert np.all(counts == slow_counts) 38 | px = slow_counts.astype(float)/ slow_counts.sum() 39 | slow_entropy = - np.sum(px * np.log(px)) 40 | assert np.abs(slow_entropy - entropy) < 1.e-8 41 | 42 | 43 | def slow_information_gain(labels0, labels1): 44 | H = 0. 45 | N = len(labels0) + len(labels1) 46 | nlabels = 1+max(labels0.max(), labels1.max()) 47 | counts = np.empty(nlabels, np.double) 48 | for arg in (labels0, labels1): 49 | H -= len(arg)/float(N) * set_entropy(arg, counts) 50 | return H 51 | 52 | def test_information_gain(): 53 | np.random.seed(22) 54 | for i in range(8): 55 | labels0 = (np.random.randn(20) > .2).astype(int) 56 | labels1 = (np.random.randn(33) > .8).astype(int) 57 | fast = information_gain(labels0, labels1) 58 | slow = slow_information_gain(labels0, labels1) 59 | assert np.abs(fast - slow) < 1.e-8 60 | 61 | 62 | def test_information_gain_small(): 63 | labels1 = np.array([0]) 64 | labels0 = np.array([0, 1]) 65 | assert information_gain(labels0, labels1) < 0. 
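# Note: as slow_information_gain makes explicit, information_gain returns
# only the negative weighted entropy of the two children (the constant
# parent-entropy term is omitted, presumably because it does not affect
# which split maximises the gain), so its values are always <= 0 -- hence
# the strict `< 0.` assertion above.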
66 | 67 | 68 | def test_z1_loss(): 69 | from milk.supervised.tree import z1_loss 70 | L0 = np.zeros(10) 71 | L1 = np.ones(10) 72 | L1[3] = 0 73 | W0 = np.ones(10) 74 | W1 = np.ones(10) 75 | assert z1_loss(L0, L1) == z1_loss(L0, L1, W0, W1) 76 | assert z1_loss(L0, L1) != z1_loss(L0, L1, W0, .8*W1) 77 | assert z1_loss(L0, L1) > 0 78 | 79 | 80 | def test_stump_learner(): 81 | learner = stump_learner() 82 | np.random.seed(111) 83 | for i in range(8): 84 | features = np.random.random_sample((40,2)) 85 | features[:20,0] += .5 86 | labels = np.repeat((0,1),20) 87 | model = learner.train(features, labels, normalisedlabels=True) 88 | assert not model.apply([0.01,.5]) 89 | assert model.apply(np.random.random_sample(2)+.8) 90 | assert model.idx == 0 91 | 92 | -------------------------------------------------------------------------------- /milk/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from milk.utils.utils import get_nprandom, get_pyrandom 3 | 4 | def test_nprandom(): 5 | assert get_nprandom(None).rand() != get_nprandom(None).rand() 6 | assert get_nprandom(1).rand() != get_nprandom(2).rand() 7 | assert get_nprandom(1).rand() == get_nprandom(1).rand() 8 | r = get_nprandom(1) 9 | assert get_nprandom(r).rand() != r.rand() 10 | 11 | def test_pyrandom(): 12 | assert get_pyrandom(None).random() != get_pyrandom(None).random() 13 | assert get_pyrandom(1).random() != get_pyrandom(2).random() 14 | assert get_pyrandom(1).random() == get_pyrandom(1).random() 15 | r = get_pyrandom(1) 16 | assert get_pyrandom(r).random() != r.random() 17 | 18 | def test_cross_random(): 19 | assert get_pyrandom(get_nprandom(1)).random() == get_pyrandom(get_nprandom(1)).random() 20 | assert get_nprandom(get_pyrandom(1)).rand() == get_nprandom(get_pyrandom(1)).rand() 21 | 22 | def test_recursive(): 23 | def recurse(f): 24 | R = f(None) 25 | assert f(R) is R 26 | yield recurse, get_pyrandom 27 | yield recurse, get_nprandom 28 | 29 | -------------------------------------------------------------------------------- /milk/unsupervised/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | ''' 8 | milk.unsupervised 9 | 10 | Unsupervised Learning 11 | --------------------- 12 | 13 | - kmeans: This is a highly optimised implementation of kmeans 14 | - PCA: Simple implementation 15 | - Non-negative matrix factorisation: both direct and with sparsity constraints 16 | ''' 17 | 18 | from .kmeans import kmeans,repeated_kmeans, select_best_kmeans 19 | from .gaussianmixture import * 20 | from .pca import pca, mds, mds_dists 21 | from . 
import nnmf 22 | from .nnmf import * 23 | from .pdist import pdist, plike 24 | from .som import som 25 | from .normalise import zscore, center 26 | 27 | __all__ = [ 28 | 'center', 29 | 'kmeans', 30 | 'mds', 31 | 'mds_dists', 32 | 'pca', 33 | 'pdist', 34 | 'plike', 35 | 'repeated_kmeans', 36 | 'select_best_kmeans', 37 | 'som', 38 | 'zscore', 39 | ] + \ 40 | nnmf.__all__ 41 | -------------------------------------------------------------------------------- /milk/unsupervised/_som.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <limits> 3 | #include <cmath> 4 | #include "../utils/utils.h" 5 | 6 | extern "C" { 7 | #include <Python.h> 8 | #include <numpy/ndarrayobject.h> 9 | } 10 | 11 | namespace { 12 | struct SOM_Exception { 13 | SOM_Exception(const char* msg): msg(msg) { } 14 | const char* msg; 15 | 16 | }; 17 | void assert_type_contiguous(PyArrayObject* array,int type) { 18 | if (!PyArray_Check(array) || 19 | PyArray_TYPE(array) != type || 20 | !PyArray_ISCONTIGUOUS(array)) { 21 | throw SOM_Exception("Arguments to putpoints don't conform to expectation. Are you calling this directly? This is an internal function!"); 22 | } 23 | } 24 | 25 | void putpoints(PyArrayObject* grid, PyArrayObject* points, float L, int radius) { 26 | if (PyArray_NDIM(grid) != 3) throw SOM_Exception("grid should be three dimensional"); 27 | if (PyArray_NDIM(points) != 2) throw SOM_Exception("points should be two dimensional"); 28 | const int rows = PyArray_DIM(grid, 0); 29 | const int cols = PyArray_DIM(grid, 1); 30 | const int d = PyArray_DIM(grid, 2); 31 | const int n = PyArray_DIM(points, 0); 32 | if (PyArray_DIM(points, 1) != d) throw SOM_Exception("second dimension of points is not third dimension of grid"); 33 | 34 | Py_BEGIN_ALLOW_THREADS 35 | 36 | for (int i = 0; i != n; i++){ 37 | const float* p = static_cast<const float*>(PyArray_GETPTR1(points, i)); 38 | int min_y = 0; 39 | int min_x = 0; 40 | float best = std::numeric_limits<float>::max(); 41 | for (int y = 0; y != rows; ++y) { 42 | for (int x = 0; x != cols; ++x) { 43 | float dist = 0.; 44 | const float* gpoint = static_cast<const float*>(PyArray_GETPTR2(grid, y, x)); 45 | for (int j = 0; j != d; ++j) { 46 | dist += (p[j] - gpoint[j])*(p[j] - gpoint[j]); 47 | } 48 | if (dist < best) { 49 | best = dist; 50 | min_y = y; 51 | min_x = x; 52 | } 53 | } 54 | } 55 | const int start_y = std::max(0, min_y - radius); 56 | const int start_x = std::max(0, min_x - radius); 57 | const int end_y = std::min(rows, min_y + radius); 58 | const int end_x = std::min(cols, min_x + radius); 59 | 60 | for (int y = start_y; y != end_y; ++y) { 61 | for (int x = start_x; x != end_x; ++x) { 62 | const float L2 = L /(1 + std::abs(min_y - y) + std::abs(min_x - x)); 63 | float* gpoint = static_cast<float*>(PyArray_GETPTR2(grid, y, x)); 64 | for (int j = 0; j != d; ++j) { 65 | gpoint[j] *= (1.-L2); 66 | gpoint[j] += L2 * p[j]; 67 | } 68 | } 69 | } 70 | } 71 | Py_END_ALLOW_THREADS 72 | } 73 | 74 | 75 | PyObject* py_putpoints(PyObject* self, PyObject* args) { 76 | try { 77 | PyArrayObject* grid; 78 | PyArrayObject* points; 79 | float L; 80 | int radius; 81 | if (!PyArg_ParseTuple(args, "OOfi", &grid, &points, &L, &radius)) { 82 | const char* errmsg = "Arguments were not what was expected for putpoints.\n" 83 | "This is an internal function: Do not call directly unless you know exactly what you're doing.\n"; 84 | PyErr_SetString(PyExc_RuntimeError,errmsg); 85 | return 0; 86 | } 87 | assert_type_contiguous(grid, NPY_FLOAT); 88 | assert_type_contiguous(points, NPY_FLOAT); 89 | putpoints(grid, points, L, radius); 90 | 91 | 
Py_RETURN_NONE; 92 | } catch (const SOM_Exception& exc) { 93 | PyErr_SetString(PyExc_RuntimeError,exc.msg); 94 | return 0; 95 | } catch (...) { 96 | PyErr_SetString(PyExc_RuntimeError,"Some sort of exception in putpoints."); 97 | return 0; 98 | } 99 | } 100 | 101 | PyMethodDef methods[] = { 102 | {"putpoints", py_putpoints, METH_VARARGS, "Do NOT call directly.\n" }, 103 | {NULL, NULL,0,NULL}, 104 | }; 105 | 106 | const char * module_doc = 107 | "Internal SOM Module.\n" 108 | "\n" 109 | "Do NOT use directly!\n"; 110 | 111 | } // namespace 112 | 113 | 114 | DECLARE_MODULE(_som) 115 | 116 | -------------------------------------------------------------------------------- /milk/unsupervised/affinity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 3 | # Copyright (C) 2010-2011, 4 | # Luis Pedro Coelho <luis@luispedro.org>, 5 | # Alexandre Gramfort <alexandre.gramfort@inria.fr>, 6 | # Gael Varoquaux <gael.varoquaux@normalesup.org> 7 | # 8 | # License: MIT. See COPYING.MIT file in the milk distribution 9 | """Affinity propagation 10 | 11 | Original Authors (for scikits.learn): 12 | Alexandre Gramfort alexandre.gramfort@inria.fr 13 | Gael Varoquaux gael.varoquaux@normalesup.org 14 | 15 | Luis Pedro Coelho made the implementation more careful about allocating 16 | intermediate arrays. 17 | """ 18 | 19 | import numpy as np 20 | 21 | __all__ = [ 22 | 'affinity_propagation', 23 | ] 24 | 25 | def affinity_propagation(S, p=None, convit=30, maxit=200, damping=0.5, copy=True, R=0): 26 | """Perform Affinity Propagation Clustering of data 27 | 28 | Parameters 29 | ---------- 30 | S : array [n_points, n_points] 31 | Matrix of similarities between points 32 | p : array [n_points,] or float, optional 33 | Preferences for each point 34 | damping : float, optional 35 | Damping factor 36 | copy : boolean, optional 37 | If copy is False, the affinity matrix is modified inplace by the 38 | algorithm, for memory efficiency 39 | R : source of randomness 40 | 41 | Returns 42 | ------- 43 | 44 | cluster_centers_indices : array [n_clusters] 45 | index of clusters centers 46 | 47 | labels : array [n_points] 48 | cluster labels for each point 49 | 50 | Notes 51 | ----- 52 | See examples/plot_affinity_propagation.py for an example. 53 | 54 | Reference: 55 | Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages 56 | Between Data Points", Science Feb. 
2007 57 | 58 | """ 59 | if copy: 60 | # Copy the affinity matrix to avoid modifying it inplace 61 | S = S.copy() 62 | 63 | n_points = S.shape[0] 64 | 65 | assert S.shape[0] == S.shape[1] 66 | 67 | if p is None: 68 | p = np.median(S) 69 | 70 | if damping < 0.5 or damping >= 1: 71 | raise ValueError('damping must be >= 0.5 and < 1') 72 | 73 | random_state = np.random.RandomState(R) 74 | 75 | # Place preferences on the diagonal of S 76 | S.flat[::(n_points+1)] = p 77 | 78 | A = np.zeros((n_points, n_points)) 79 | R = np.zeros((n_points, n_points)) # Initialize messages 80 | 81 | # Remove degeneracies 82 | noise = random_state.randn(n_points, n_points) 83 | typeinfo = np.finfo(S.dtype) 84 | noise *= typeinfo.tiny*100 85 | S += noise 86 | del noise 87 | 88 | # Execute parallel affinity propagation updates 89 | e = np.zeros((n_points, convit)) 90 | 91 | ind = np.arange(n_points) 92 | 93 | for it in range(maxit): 94 | Aold = A.copy() 95 | Rold = R.copy() 96 | A += S 97 | 98 | I = np.argmax(A, axis=1) 99 | Y = A[ind, I] # np.max(A, axis=1) 100 | 101 | A[ind, I] = typeinfo.min 102 | 103 | Y2 = np.max(A, axis=1) 104 | R = S - Y[:, np.newaxis] 105 | 106 | R[ind, I[ind]] = S[ind, I] - Y2 107 | 108 | Rold *= damping 109 | R *= (1-damping) 110 | R += Rold 111 | 112 | # Compute availabilities 113 | Rd = R.diagonal().copy() 114 | np.maximum(R, 0, R) 115 | R.flat[::n_points+1] = Rd 116 | 117 | A = np.sum(R, axis=0)[np.newaxis, :] - R 118 | 119 | dA = np.diag(A) 120 | A = np.minimum(A, 0) 121 | 122 | A.flat[::n_points+1] = dA 123 | 124 | Aold *= damping 125 | A *= (1-damping) 126 | A += Aold 127 | 128 | # Check for convergence 129 | E = (np.diag(A) + np.diag(R)) > 0 130 | e[:, it % convit] = E 131 | K = np.sum(E, axis=0) 132 | 133 | if it >= convit: 134 | se = np.sum(e, axis=1) 135 | unconverged = np.sum((se == convit) + (se == 0)) != n_points 136 | if not unconverged and (K > 0): 137 | print("Converged after %d iterations." 
% it) 138 | break 139 | else: 140 | print("Did not converge") 141 | 142 | I = np.where(np.diag(A+R) > 0)[0] 143 | K = I.size # Identify exemplars 144 | 145 | if K > 0: 146 | c = np.argmax(S[:, I], axis=1) 147 | c[I] = np.arange(K) # Identify clusters 148 | # Refine the final set of exemplars and clusters and return results 149 | for k in range(K): 150 | ii = np.where(c==k)[0] 151 | j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0)) 152 | I[k] = ii[j] 153 | 154 | c = np.argmax(S[:, I], axis=1) 155 | c[I] = np.arange(K) 156 | labels = I[c] 157 | # Reduce labels to a sorted, gapless, list 158 | cluster_centers_indices = np.unique(labels) 159 | labels = np.searchsorted(cluster_centers_indices, labels) 160 | else: 161 | labels = np.empty((n_points, 1)) 162 | cluster_centers_indices = None 163 | labels.fill(np.nan) 164 | 165 | return cluster_centers_indices, labels 166 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | from . import lee_seung as ls 23 | from .hoyer import sparse_nnmf 24 | from . import hoyer 25 | lee_seung = ls.nnmf 26 | 27 | __all__ = ['lee_seung','sparse_nnmf'] 28 | 29 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 30 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/hoyer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | 24 | import numpy as np 25 | from ...utils import get_nprandom 26 | 27 | __all__ = ['sparse_nnmf'] 28 | 29 | 30 | def sp(s): 31 | L2 = np.sqrt(np.dot(s,s)) 32 | L1 = np.abs(s).sum() 33 | sn = np.sqrt(len(s)) 34 | return (sn-L1/L2)/(sn-1) 35 | 36 | def _solve_alpha(s,m,L2): 37 | sm = s-m 38 | s2 = np.dot(s,s) 39 | sm2 = np.dot(sm, sm) 40 | m2 = np.dot(m, m) 41 | dot = np.dot(m, sm) 42 | alpha = (-dot + np.sqrt(dot**2 - sm2*(m2-L2**2)))/sm2 43 | return alpha 44 | 45 | def _project(x,L1,L2): 46 | ''' 47 | Project x onto the set of non-negative vectors with the given L1 and L2 norms (Hoyer's projection operator) 48 | ''' 49 | x = np.asanyarray(x) 50 | n = len(x) 51 | 52 | s = x + (L1 - x.sum())/n 53 | Z = np.zeros(n,bool) 54 | while True: 55 | m = (~Z) * L1/(n-Z.sum()) 56 | alpha = _solve_alpha(s,m,L2) 57 | s = m + alpha * (s - m) 58 | negs = (s < 0) 59 | if not negs.any(): 60 | return s 61 | Z |= negs 62 | s[Z] = 0 63 | c = (s.sum() - L1)/(~Z).sum() 64 | s -= c*(~Z) 65 | 66 | def _L1for(s,x,L2): 67 | ''' 68 | Solve for L1 in 69 | 70 | s = [ sqrt(n) - L1/L2] / [sqrt(n) - 1] 71 | ''' 72 | sn = np.sqrt(len(x)) 73 | return L2*(s+sn-s*sn) 74 | 75 | def sparse_nnmf(V, r, sparsenessW=None, sparsenessH=None, max_iter=10000, R=None): 76 | ''' 77 | W,H = hoyer.sparse_nnmf(V, r, sparsenessW = None, sparsenessH = None, max_iter=10000, R=None) 78 | 79 | Implement sparse nonnegative matrix factorisation. 80 | 81 | Parameters 82 | ---------- 83 | V : 2-D matrix 84 | input feature matrix 85 | r : integer 86 | number of latent features 87 | sparsenessW : double, optional 88 | sparseness constraint on W (default: no sparsity constraint) 89 | sparsenessH : double, optional 90 | sparseness constraint on H (default: no sparsity constraint) 91 | max_iter : integer, optional 92 | maximum number of iterations (default: 10000) 93 | R : integer, optional 94 | source of randomness 95 | 96 | Returns 97 | ------- 98 | W : 2-ndarray 99 | H : 2-ndarray 100 | 101 | Reference 102 | --------- 103 | "Non-negative Matrix Factorisation with Sparseness Constraints" 104 | by Patrik Hoyer 105 | in Journal of Machine Learning Research 5 (2004) 1457--1469 106 | ''' 107 | 108 | n,m = V.shape 109 | R = get_nprandom(R) 110 | mu_W = .15 111 | mu_H = .15 112 | eps = 1e-8 113 | W = R.standard_normal((n,r))**2 114 | H = R.standard_normal((r,m))**2 115 | 116 | def fix(X, sparseness): 117 | for i in range(r): 118 | row = X[i] 119 | L2 = np.sqrt(np.dot(row, row)) 120 | row /= L2 121 | X[i] = _project(row, _L1for(sparseness, row, 1.), 1.) 
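            # X[i] now has unit L2 norm and the L1 norm that realises the
            # requested Hoyer sparseness level (cf. sp() above).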
122 | 123 | def fixW(): 124 | fix(W.T, sparsenessW) 125 | def fixH(): 126 | fix(H, sparsenessH) 127 | 128 | if sparsenessW is not None: fixW() 129 | if sparsenessH is not None: fixH() 130 | for i in range(max_iter): 131 | if sparsenessW is not None: 132 | W -= mu_W * np.dot(np.dot(W,H)-V,H.T) 133 | fixW() 134 | else: 135 | updateW = np.dot(V,H.T)/(np.dot(W,np.dot(H,H.T))+eps) 136 | W *= updateW 137 | if sparsenessH is not None: 138 | H -= mu_H * np.dot(W.T,np.dot(W,H)-V) 139 | fixH() 140 | else: 141 | updateH = np.dot(W.T,V)/(np.dot(np.dot(W.T,W),H)+eps) 142 | H *= updateH 143 | return W,H 144 | 145 | -------------------------------------------------------------------------------- /milk/unsupervised/nnmf/lee_seung.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
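# Usage sketch (hypothetical data; nnmf is defined below):
#
#   import numpy as np
#   from milk.unsupervised.nnmf.lee_seung import nnmf
#   V = np.abs(np.random.standard_normal((50, 30)))  # non-negative input
#   W, H = nnmf(V, r=4, R=0)
#   print(np.linalg.norm(V - np.dot(W, H)))          # reconstruction error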
22 | 23 | from __future__ import division 24 | import numpy as np 25 | from numpy import dot 26 | from ...utils import get_nprandom 27 | 28 | __all__ = ['nnmf'] 29 | 30 | def nnmf(V, r, cost='norm2', max_iter=int(1e4), tol=1e-8, R=None): 31 | ''' 32 | A,S = nnmf(X, r, cost='norm2', max_iter=10000, tol=1e-8, R=None) 33 | 34 | Implement Lee & Seung's algorithm 35 | 36 | Parameters 37 | ---------- 38 | V : 2-ndarray 39 | input matrix 40 | r : integer 41 | nr of latent features 42 | cost : one of: 43 | 'norm2' : minimise || X - AS ||_2 (default) 44 | 'i-div' : minimise D(X||AS), where D is I-divergence (generalisation of K-L divergence) 45 | max_iter : integer, optional 46 | maximum number of iterations (default: 10000) 47 | tol : double 48 | tolerance threshold for early exit (when the update factor is within tol 49 | of 1., the function exits) 50 | R : integer, optional 51 | random seed 52 | 53 | Returns 54 | ------- 55 | A : 2-ndarray 56 | S : 2-ndarray 57 | 58 | Reference 59 | --------- 60 | "Algorithms for Non-negative Matrix Factorization" 61 | by Daniel D Lee, Sebastian H Seung 62 | (available at http://citeseer.ist.psu.edu/lee01algorithms.html) 63 | ''' 64 | # Nomenclature inside the function follows Lee & Seung (W,H); the docstring's A,S name the same matrices 65 | eps = 1e-8 66 | n,m = V.shape 67 | R = get_nprandom(R) 68 | W = R.standard_normal((n,r))**2 69 | H = R.standard_normal((r,m))**2 70 | for i in range(max_iter): 71 | if cost == 'norm2': 72 | updateH = dot(W.T,V)/(dot(dot(W.T,W),H)+eps) 73 | H *= updateH 74 | updateW = dot(V,H.T)/(dot(W,dot(H,H.T))+eps) 75 | W *= updateW 76 | elif cost == 'i-div': 77 | raise NotImplementedError('I-Div not implemented in lee_seung.nnmf') 78 | if (i % 10) == 0: 79 | max_update = max(updateW.max(),updateH.max()) 80 | if abs(1.-max_update) < tol: 81 | break 82 | return W,H 83 | 84 | -------------------------------------------------------------------------------- /milk/unsupervised/normalise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
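# Usage sketch (hypothetical data; zscore and center are defined below):
#
#   import numpy as np
#   from milk.unsupervised.normalise import zscore, center
#   X = np.random.random((10, 3))
#   Z = zscore(X)              # columns now have mean ~0 and std ~1
#   C, mu = center(X, axis=0)  # C is X minus its column means, mu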
22 | 23 | from __future__ import division 24 | import numpy as np 25 | __all__ = [ 26 | 'center', 27 | 'zscore', 28 | ] 29 | def _nanmean(arr, axis=None): 30 | nancounts = np.sum(~np.isnan(arr), axis=axis) 31 | return np.nansum(arr,axis=axis)/nancounts 32 | def _nanstd(arr, axis=None): 33 | if axis == 1: 34 | return _nanstd(arr.T, axis=0) 35 | mu = _nanmean(arr,axis=axis) 36 | return np.sqrt(_nanmean((arr-mu)**2, axis=axis)) 37 | 38 | 39 | def zscore(features, axis=0, can_have_nans=True, inplace=False): 40 | """ 41 | features = zscore(features, axis=0, can_have_nans=True, inplace=False) 42 | 43 | Returns features normalised to z-scores (a copy, unless ``inplace`` is True) 44 | 45 | Parameters 46 | ---------- 47 | features : ndarray 48 | 2-D input array 49 | axis : integer, optional 50 | which axis to normalise (default: 0) 51 | can_have_nans : boolean, optional 52 | whether ``features`` is allowed to have NaNs (default: True) 53 | inplace : boolean, optional 54 | Whether to operate inline (i.e., potentially change the input array). 55 | Default is False 56 | 57 | Returns 58 | ------- 59 | features : ndarray 60 | zscored version of features 61 | """ 62 | if not inplace: 63 | features = features.copy() 64 | else: 65 | features = np.asarray(features) 66 | if features.ndim != 2: 67 | raise ValueError('milk.unsupervised.zscore: Can only handle 2-D arrays') 68 | if can_have_nans: 69 | mu = _nanmean(features, axis) 70 | sigma = _nanstd(features, axis) 71 | else: 72 | mu = features.mean(axis) 73 | sigma = np.std(features, axis) 74 | sigma[sigma == 0] = 1. 75 | if axis == 0: 76 | features -= mu 77 | features /= sigma 78 | elif axis == 1: 79 | features -= mu[:,None] 80 | features /= sigma[:,None] 81 | return features 82 | 83 | 84 | 85 | def center(features, axis=0, can_have_nans=True, inplace=False): 86 | ''' 87 | centered, mean = center(features, axis=0, can_have_nans=True, inplace=False) 88 | 89 | Center data 90 | 91 | Parameters 92 | ---------- 93 | features : ndarray 94 | 2-D input array 95 | axis : integer, optional 96 | which axis to normalise (default: 0) 97 | can_have_nans : boolean, optional 98 | whether ``features`` is allowed to have NaNs (default: True) 99 | inplace : boolean, optional 100 | Whether to operate inline (i.e., potentially change the input array). 
101 | Default is False 102 | 103 | Returns 104 | ------- 105 | features : ndarray 106 | centered version of features 107 | mean : ndarray 108 | mean values 109 | ''' 110 | if can_have_nans: 111 | meanfunction = _nanmean 112 | else: 113 | meanfunction = np.mean 114 | features = np.array(features, copy=(not inplace), dtype=float) 115 | mean = meanfunction(features, axis=axis) 116 | if axis == 0: 117 | features -= mean 118 | elif axis == 1: 119 | features -= mean[:,None] 120 | else: 121 | raise ValueError('milk.unsupervised.center: axis ∉ {0, 1}') 122 | return features, mean 123 | 124 | -------------------------------------------------------------------------------- /milk/unsupervised/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | from __future__ import division 24 | import numpy as np 25 | 26 | def get_parzen_1class_rbf_loocv(features): 27 | ''' 28 | f,fprime = get_parzen_1class_rbf_loocv(features) 29 | 30 | Leave-one-out crossvalidation (LOOCV) value of a 1-class Parzen window 31 | estimator on features. 32 | 33 | Parameters 34 | ---------- 35 | features : ndarray 36 | feature matrix 37 | 38 | Returns 39 | ------- 40 | f : function: double -> double 41 | function which evaluates the LOOCV value at a given window bandwidth. 42 | Minimize to get the best bandwidth. 43 | fprime : function: double -> double 44 | function: df/dh 45 | ''' 46 | from milk.unsupervised.pdist import pdist 47 | D2 = -pdist(features) 48 | n = len(features) 49 | sumD2 = D2.sum() 50 | D2.flat[::(n+1)] = -np.inf 51 | def f(h): 52 | D2h = D2 / (2.*h) 53 | np.exp(D2h, D2h) 54 | val = D2h.sum() 55 | return val/np.sqrt(2*h*np.pi) 56 | def fprime(h): 57 | D2h = D2 / (2.*h) 58 | D2h.flat[::(n+1)] = 1. 
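        # Self-comparisons sit on the diagonal (set to -inf on D2 above); give
        # them a finite placeholder here and remove their contribution below
        # via the trace subtraction.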
59 | D2h *= np.exp(D2h) 60 | val = D2h.sum() - D2h.trace() 61 | val /= np.sqrt(2*h*np.pi) 62 | return -1./(4*np.pi*h)*f(h) + val 63 | return f,fprime 64 | 65 | def parzen(features, h): 66 | ''' 67 | f = parzen(features, h) 68 | 69 | Parzen window smoothing 70 | 71 | Parameters 72 | ---------- 73 | features : ndarray 74 | feature matrix 75 | h : double 76 | bandwidth 77 | 78 | Returns 79 | ------- 80 | f : callable (double^N -> double) 81 | density function 82 | ''' 83 | sum2 = np.array([np.dot(f,f) for f in features]) 84 | N = len(features) 85 | beta = 1./(N*np.sqrt(2*h*np.pi)) # normalisation, matching the LOOCV version above 86 | def f(x): 87 | dist = np.dot(features, -2*x) 88 | dist += sum2 89 | dist += np.dot(x,x) # completes ||f_i - x||^2 90 | dist /= -2.*h # Gaussian kernel: exp(-||f_i - x||^2/(2h)) 91 | np.exp(dist, dist) 92 | val = dist.sum() 93 | return val*beta 94 | return f 95 | 96 | -------------------------------------------------------------------------------- /milk/unsupervised/pca.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2013, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | from numpy import linalg 10 | from . import normalise 11 | from .pdist import pdist 12 | 13 | __all__ = [ 14 | 'pca', 15 | 'mds', 16 | 'mds_dists', 17 | ] 18 | 19 | def pca(X, zscore=True): 20 | ''' 21 | Y,V = pca(X, zscore=True) 22 | 23 | Principal Component Analysis 24 | 25 | Performs principal component analysis. Returns transformed 26 | matrix and principal components 27 | 28 | Parameters 29 | ---------- 30 | X : 2-dimensional ndarray 31 | data matrix 32 | zscore : boolean, optional 33 | whether to normalise to zscores (default: True) 34 | 35 | Returns 36 | ------- 37 | Y : ndarray 38 | Transformed matrix (of same dimension as X) 39 | V : ndarray 40 | principal components 41 | ''' 42 | if zscore: 43 | X = normalise.zscore(X) 44 | C = np.cov(X.T) 45 | w,v = linalg.eig(C) 46 | Y = np.dot(v,X.T).T 47 | return Y,v 48 | 49 | 50 | def mds(features, ndims, zscore=False): 51 | ''' 52 | X = mds(features, ndims, zscore=False) 53 | 54 | Euclidean Multi-dimensional Scaling 55 | 56 | Parameters 57 | ---------- 58 | features : ndarray 59 | data matrix 60 | ndims : int 61 | Number of dimensions to return 62 | zscore : boolean, optional 63 | Whether to zscore the features (default: False) 64 | 65 | Returns 66 | ------- 67 | X : ndarray 68 | array of size ``(m, ndims)`` where ``m = len(features)`` 69 | 70 | See Also 71 | -------- 72 | mds_dists : function 73 | ''' 74 | if zscore: 75 | features = normalise.zscore(features) 76 | else: 77 | features = np.asarray(features) 78 | P2 = pdist(features) 79 | return mds_dists(P2, ndims) 80 | 81 | def mds_dists(distances, ndims): 82 | ''' 83 | X = mds_dists(distances, ndims) 84 | 85 | Euclidean Multi-dimensional Scaling based on a distance matrix 86 | 87 | Parameters 88 | ---------- 89 | distances : ndarray 90 | data matrix 91 | ndims : int 92 | Number of dimensions to return 93 | 94 | Returns 95 | ------- 96 | X : ndarray 97 | array of size ``(m, ndims)`` where ``m = len(features)`` 98 | 99 | See Also 100 | -------- 101 | mds : function 102 | ''' 103 | 104 | n = len(distances) 105 | J = np.eye(n) - (1./n)* np.ones((n,n)) 106 | B = -.5 * np.dot(J,np.dot(distances,J)) 107 | w,v = np.linalg.eig(B) 108 | worder = w.argsort() 109 | worder = worder[::-1] 110 | w = w[worder] 111 | v = v[:,worder] 112 | 113 | 114 | w = w[:ndims] 115 | s = np.sign(w) 116 
| w = np.abs(w).real 117 | w = np.diag(np.sqrt(s * w)) 118 | X = np.dot(v[:,:ndims], w) 119 | return X.real 120 | 121 | -------------------------------------------------------------------------------- /milk/unsupervised/pdist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # License: MIT. See COPYING.MIT file in the milk distribution 6 | 7 | from __future__ import division 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'pdist', 12 | 'plike', 13 | ] 14 | 15 | def pdist(X, Y=None, distance='euclidean2'): 16 | ''' 17 | D = pdist(X, Y={X}, distance='euclidean2') 18 | 19 | Compute distance matrix:: 20 | 21 | D[i,j] == np.sum( (X[i] - Y[j])**2 ) 22 | 23 | Parameters 24 | ---------- 25 | X : feature matrix 26 | Y : feature matrix (default: use `X`) 27 | distance : one of 'euclidean' or 'euclidean2' (default) 28 | 29 | Returns 30 | ------- 31 | D : matrix of doubles 32 | ''' 33 | # Use Dij = np.dot(Xi, Xi) + np.dot(Xj,Xj) - 2.*np.dot(Xi,Xj) 34 | if Y is None: 35 | D = np.dot(X, X.T) 36 | x2 = D.diagonal() 37 | x2 = x2.copy() 38 | y2 = x2 39 | else: 40 | D = np.dot(X, Y.T) 41 | x2 = np.array([np.dot(x,x) for x in X]) 42 | y2 = np.array([np.dot(y,y) for y in Y]) 43 | D *= -2. 44 | D += x2[:,np.newaxis] 45 | D += y2 46 | 47 | # Because of numerical imprecision, we might get negative numbers 48 | # (which cause problems down the road, e.g., when doing the sqrt): 49 | np.maximum(D, 0, D) 50 | if distance == 'euclidean': 51 | np.sqrt(D, D) 52 | return D 53 | 54 | 55 | def plike(X, sigma2=None): 56 | ''' 57 | L = plike(X, sigma2={guess based on X}) 58 | 59 | Compute likelihood that any two objects come from the same distribution 60 | under a Gaussian distribution hypothesis:: 61 | 62 | L[i,j] = exp( -||X[i] - X[j]||^2 / sigma2 ) 63 | 64 | Parameters 65 | ---------- 66 | X : ndarray 67 | feature matrix 68 | sigma2 : float, optional 69 | bandwidth 70 | 71 | Returns 72 | ------- 73 | L : ndarray 74 | likelihood matrix 75 | 76 | See Also 77 | -------- 78 | pdist : function 79 | Compute distances between objects 80 | ''' 81 | 82 | L = pdist(X) 83 | if sigma2 is None: 84 | sigma2 = np.median(L) 85 | L /= -sigma2 86 | np.exp(L, L) 87 | return L 88 | -------------------------------------------------------------------------------- /milk/unsupervised/som.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | from __future__ import division 6 | 7 | import numpy as np 8 | 9 | from ..utils import get_pyrandom 10 | from . 
import _som 11 | 12 | def putpoints(grid, points, L=.2, radius=4, iterations=1, shuffle=True, R=None): 13 | ''' 14 | putpoints(grid, points, L=.2, radius=4, iterations=1, shuffle=True, R=None) 15 | 16 | Feeds elements of `points` into the SOM `grid` 17 | 18 | Parameters 19 | ---------- 20 | grid : ndarray 21 | Self organising map 22 | points : ndarray 23 | data to feed to array 24 | L : float, optional 25 | How much to influence neighbouring points (default: .2) 26 | radius : integer, optional 27 | Maximum radius of influence (in L_1 distance, default: 4) 28 | iterations : integer, optional 29 | Number of iterations 30 | shuffle : boolean, optional 31 | Whether to shuffle the points before each iteration 32 | R : source of randomness 33 | ''' 34 | if radius is None: 35 | radius = 4 36 | if type(L) != float: 37 | raise TypeError("milk.unsupervised.som: L should be floating point") 38 | if type(radius) != int: 39 | raise TypeError("milk.unsupervised.som: radius should be an integer") 40 | if grid.dtype != np.float32: 41 | raise TypeError('milk.unsupervised.som: only float32 arrays are accepted') 42 | if points.dtype != np.float32: 43 | raise TypeError('milk.unsupervised.som: only float32 arrays are accepted') 44 | if len(grid.shape) == 2: 45 | grid = grid.reshape(grid.shape+(1,)) 46 | if shuffle: 47 | random = get_pyrandom(R) 48 | for i in range(iterations): 49 | if shuffle: 50 | random.shuffle(points) 51 | _som.putpoints(grid, points, L, radius) 52 | 53 | def closest(grid, f): 54 | ''' 55 | y,x = closest(grid, f) 56 | 57 | Finds the coordinates of the closest point in the `grid` to `f` 58 | 59 | :: 60 | 61 | y,x = \\argmin_{y,x} { || grid[y,x] - f ||^2 } 62 | 63 | Parameters 64 | ---------- 65 | grid : ndarray of shape Y,X,J 66 | self-organised map 67 | f : ndarray of shape J 68 | point 69 | 70 | Returns 71 | ------- 72 | y,x : integers 73 | coordinates into `grid` 74 | ''' 75 | delta = grid - f 76 | delta **= 2 77 | delta = delta.sum(2) 78 | return np.unravel_index(delta.argmin(), delta.shape) 79 | 80 | 81 | def som(data, shape, iterations=1000, L=.2, radius=4, R=None): 82 | ''' 83 | grid = som(data, shape, iterations=1000, L=.2, radius=4, R=None): 84 | 85 | Self-organising maps 86 | 87 | Parameters 88 | ---------- 89 | data : ndarray 90 | data to feed to array 91 | shape : tuple 92 | Desired shape of output. Must be 2-dimensional. 
93 | L : float, optional 94 | How much to influence neighbouring points (default: .2) 95 | radius : integer, optional 96 | Maximum radius of influence (in L_1 distance, default: 4) 97 | iterations : integer, optional 98 | Number of iterations 99 | R : source of randomness 100 | 101 | Returns 102 | ------- 103 | grid : ndarray 104 | Map 105 | ''' 106 | R = get_pyrandom(R) 107 | d = data.shape[1] 108 | if data.dtype != np.float32: 109 | data = data.astype(np.float32) 110 | grid = np.array(R.sample(list(data), np.prod(shape))).reshape(shape + (d,)) 111 | putpoints(grid, data, L=L, radius=radius, iterations=iterations, shuffle=True, R=R) 112 | return grid 113 | -------------------------------------------------------------------------------- /milk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | -------------------------------------------------------------------------------- /milk/utils/parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division, with_statement 7 | import multiprocessing 8 | 9 | max_procs = 1 10 | _used_procs = multiprocessing.Value('i', 1) 11 | _plock = multiprocessing.Lock() 12 | 13 | def set_max_processors(value=None): 14 | ''' 15 | set_max_processors(value=None) 16 | 17 | Set the maximum number of processors to ``value`` (or to the number of 18 | CPUs if ``None``). 19 | 20 | Note that this is valid for the current process and its children, but not 21 | the parent. 22 | 23 | Parameters 24 | ---------- 25 | value : int, optional 26 | Number of processors to use. Defaults to number of CPUs (as returned by 27 | ``multiprocessing.cpu_count()``). 28 | ''' 29 | global max_procs 30 | if value is None: 31 | value = multiprocessing.cpu_count() 32 | max_procs = value 33 | 34 | def get_proc(): 35 | ''' 36 | available = get_proc() 37 | 38 | Reserve a processor 39 | 40 | Returns 41 | ------- 42 | available : bool 43 | True if a processor is available 44 | ''' 45 | with _plock: 46 | if _used_procs.value >= max_procs: 47 | return False 48 | _used_procs.value += 1 49 | return True 50 | 51 | def release_proc(): 52 | ''' 53 | release_proc() 54 | 55 | Returns a processor to the pool 56 | ''' 57 | with _plock: 58 | _used_procs.value -= 1 59 | 60 | def release_procs(n, count_current=True): 61 | ''' 62 | release_procs(n, count_current=True) 63 | 64 | Returns ``n`` processors to the pool 65 | 66 | Parameters 67 | ---------- 68 | n : int 69 | Number of processors to release 70 | count_current : bool, optional 71 | Whether the current processor is to be included in ``n`` (default: True) 72 | ''' 73 | if count_current: 74 | n -= 1 75 | if n > 0: 76 | with _plock: 77 | _used_procs.value -= n 78 | 79 | def get_procs(desired=None, use_current=True): 80 | ''' 81 | n = get_procs(desired=None, use_current=True) 82 | 83 | Get up to ``desired`` processors (use None for no maximum). 84 | 85 | Parameters 86 | ---------- 87 | desired : int, optional 88 | Number of processors you wish. By default, there is no maximum 89 | use_current: bool, optional 90 | Whether to count the current processor, True by default. 
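    Returns
    -------
    n : int
        Number of processors actually reserved; includes the current one
        when ``use_current`` is True.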
91 | ''' 92 | if desired is None: 93 | desired = 1024 # This should last a few years 94 | n = (1 if use_current else 0) 95 | while n < desired: 96 | if get_proc(): 97 | n += 1 98 | else: 99 | return n 100 | return n 101 | 102 | -------------------------------------------------------------------------------- /milk/utils/utils.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | #include <Python.h> 3 | #include <numpy/ndarrayobject.h> 4 | } 5 | #if PY_MAJOR_VERSION < 3 6 | 7 | #define DECLARE_MODULE(name) \ 8 | extern "C" \ 9 | void init##name () { \ 10 | import_array(); \ 11 | (void)Py_InitModule(#name, methods); \ 12 | } 13 | 14 | #else 15 | 16 | #define DECLARE_MODULE(name) \ 17 | namespace { \ 18 | struct PyModuleDef moduledef = { \ 19 | PyModuleDef_HEAD_INIT, \ 20 | #name, \ 21 | NULL, \ 22 | -1, \ 23 | methods, \ 24 | NULL, \ 25 | NULL, \ 26 | NULL, \ 27 | NULL \ 28 | }; \ 29 | } \ 30 | PyMODINIT_FUNC \ 31 | PyInit_##name () { \ 32 | import_array(); \ 33 | return PyModule_Create(&moduledef); \ 34 | } 35 | 36 | #endif 37 | 38 | -------------------------------------------------------------------------------- /milk/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2012, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
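# Usage sketch (get_nprandom and get_pyrandom are defined below): both helpers
# normalise any of {None, int, random.Random, np.random.RandomState} into a
# concrete generator, so library code can accept a single `R` argument:
#
#   from milk.utils.utils import get_nprandom, get_pyrandom
#   r_np = get_nprandom(42)    # np.random.RandomState seeded with 42
#   r_py = get_pyrandom(r_np)  # random.Random seeded from r_np
#   x = r_np.standard_normal(3)
#   u = r_py.random()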
22 | 23 | 24 | import numpy as np 25 | import random 26 | 27 | __all__ = [ 28 | 'get_nprandom', 29 | 'get_pyrandom', 30 | ] 31 | 32 | def get_nprandom(R): 33 | ''' 34 | R' = get_nprandom(R) 35 | 36 | Returns a numpy.RandomState from R 37 | 38 | Parameters 39 | ---------- 40 | R : can be one of: 41 | None : Returns the default numpy global state 42 | integer : Uses it as a seed for constructing a new random generator 43 | RandomState : returns R 44 | 45 | Returns 46 | ------- 47 | R' : np.RandomState 48 | ''' 49 | if R is None: 50 | return np.random.mtrand._rand 51 | if type(R) == int: 52 | return np.random.RandomState(R) 53 | if type(R) is random.Random: 54 | return np.random.RandomState(R.randint(0, 2**30)) 55 | if type(R) is np.random.RandomState: 56 | return R 57 | raise TypeError("get_nprandom() does not know how to handle type {0}.".format(type(R))) 58 | 59 | def get_pyrandom(R): 60 | ''' 61 | R' = get_pyrandom(R) 62 | 63 | Returns a random.Random object based on R 64 | 65 | Parameters 66 | ---------- 67 | R : can be one of: 68 | None : Returns the default Python global state 69 | integer : Uses it as a seed for constructing a new random generator 70 | RandomState : returns a random.Random seeded from R 71 | 72 | Returns 73 | ------- 74 | R' : random.Random 75 | ''' 76 | if R is None: 77 | return random.seed.__self__ 78 | if type(R) is int: 79 | return random.Random(R) 80 | if type(R) is np.random.RandomState: 81 | return random.Random(R.randint(2**30)) 82 | if type(R) is random.Random: 83 | return R 84 | raise TypeError("get_pyrandom() does not know how to handle type {0}.".format(type(R))) 85 | 86 | 87 | -------------------------------------------------------------------------------- /milk/wrapper/__init__.py: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /milk/wrapper/wraplibsvm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2010, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | from milk.supervised.classifier import normaliselabels 8 | try: 9 | from libsvm import svm as libsvm 10 | except ImportError: 11 | try: 12 | import svm as libsvm 13 | except ImportError: 14 | libsvm = None 15 | from tempfile import NamedTemporaryFile 16 | 17 | class libsvmModel(object): 18 | def __init__(self, model, names, output_probability): 19 | self.model = model 20 | self.names = names 21 | self.output_probability = output_probability 22 | 23 | def apply(self,feats): 24 | if self.output_probability: 25 | return self.model.predict_probability(feats) 26 | res = self.model.predict(feats) 27 | return self.names[int(res)] 28 | 29 | def __getstate__(self): 30 | # This is really really really hacky, but it works 31 | N = NamedTemporaryFile() 32 | self.model.save(N.name) 33 | S = N.read() 34 | return S,self.output_probability,self.names 35 | 36 | def __setstate__(self,state): 37 | if libsvm is None: 38 | raise RuntimeError('LibSVM Library not found. 
Cannot use this classifier.') 39 | S,self.output_probability,self.names = state 40 | N = NamedTemporaryFile() 41 | N.write(S) 42 | N.flush() 43 | self.model = libsvm.svm_model(N.name) 44 | 45 | 46 | class libsvmClassifier(object): 47 | def __init__(self,probability = False, auto_weighting = True): 48 | if libsvm is None: 49 | raise RuntimeError('LibSVM Library not found. Cannot use this classifier.') 50 | self.param = libsvm.svm_parameter(kernel_type = libsvm.RBF, probability = probability) 51 | self.output_probability = probability 52 | self.auto_weighting = auto_weighting 53 | 54 | def set_option(self,optname,value): 55 | setattr(self.param, optname, value) 56 | 57 | def train(self, features, labels): 58 | labels,names = normaliselabels(labels) 59 | if self.auto_weighting: 60 | nlabels = labels.max() + 1 61 | self.param.nr_weight = int(nlabels) 62 | self.param.weight_label = list(range(nlabels)) 63 | self.param.weight = [(labels != i).mean() for i in range(nlabels)] 64 | problem = libsvm.svm_problem(labels.astype(float), features) 65 | model = libsvm.svm_model(problem, self.param) 66 | return libsvmModel(model, names, self.output_probability) 67 | 68 | -------------------------------------------------------------------------------- /readthedocs-requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | numpydoc 3 | matplotlib 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2008-2014, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | from __future__ import division 24 | import os 25 | import platform 26 | 27 | try: 28 | import setuptools 29 | except: 30 | print(''' 31 | setuptools not found. 
32 | 33 | On linux, the package is often called python-setuptools''') 34 | from sys import exit 35 | exit(1) 36 | 37 | from numpy.distutils.core import setup, Extension 38 | exec(compile(open('milk/milk_version.py').read(), 39 | 'milk/milk_version.py', 'exec')) 40 | long_description = open('README.rst').read() 41 | undef_macros = [] 42 | define_macros = [] 43 | if os.environ.get('DEBUG'): 44 | undef_macros = ['NDEBUG'] 45 | if os.environ.get('DEBUG') == '2': 46 | define_macros = [ 47 | ('_GLIBCXX_DEBUG','1'), 48 | ('EIGEN_INTERNAL_DEBUGGING', '1'), 49 | ] 50 | 51 | _extensions = { 52 | 'milk.unsupervised._kmeans' : ['milk/unsupervised/_kmeans.cpp'], 53 | 'milk.unsupervised._som' : ['milk/unsupervised/_som.cpp'], 54 | 55 | 'milk.supervised._svm' : ['milk/supervised/_svm.cpp'], 56 | 'milk.supervised._tree' : ['milk/supervised/_tree.cpp'], 57 | 'milk.supervised._perceptron' : ['milk/supervised/_perceptron.cpp'], 58 | 'milk.supervised._lasso' : ['milk/supervised/_lasso.cpp'], 59 | } 60 | 61 | compiler_args = ['-std=c++0x'] 62 | if platform.system() == 'Darwin': 63 | compiler_args.append('-stdlib=libc++') 64 | 65 | ext_modules = [ 66 | Extension(key, 67 | sources=sources, 68 | undef_macros=undef_macros, 69 | define_macros=define_macros, 70 | extra_compile_args=compiler_args, 71 | ) 72 | for key,sources in _extensions.items() 73 | ] 74 | 75 | packages = [p for p in setuptools.find_packages() 76 | if p.startswith('milk')] 77 | 78 | package_dir = { 79 | 'milk.tests': 'milk/tests', 80 | } 81 | package_data = { 82 | 'milk.tests': ['data/*'], 83 | } 84 | 85 | setup(name = 'milk', 86 | version = __version__, 87 | description = 'Machine Learning Toolkit', 88 | long_description = long_description, 89 | author = u'Luis Pedro Coelho', 90 | author_email = 'luis@luispedro.org', 91 | url = 'http://luispedro.org/software/milk', 92 | license = 'MIT', 93 | packages = packages, 94 | package_dir = package_dir, 95 | package_data = package_data, 96 | ext_modules = ext_modules, 97 | test_suite = 'nose.collector', 98 | ) 99 | 100 | 101 | -------------------------------------------------------------------------------- /template.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2011, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # License: MIT. See COPYING.MIT file in the milk distribution 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | --------------------------------------------------------------------------------
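The unsupervised modules above compose into a small pipeline. A minimal sketch, assuming milk was built with its C++ extensions (e.g. _som) and using hypothetical random data:

import numpy as np
import milk.unsupervised as mu

X = np.random.random((100, 8)).astype(np.float32)
D = mu.pdist(X)                                # squared Euclidean distance matrix
Y, V = mu.pca(X.astype(float))                 # z-scored PCA projection and components
grid = mu.som(X, (8, 8), iterations=100, R=2)  # 8x8 self-organising map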