├── __init__.py ├── README.md ├── LICENSE.md └── multiisotonic.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # multiisotonic 2 | An interface for multidimensional isotonic regression consistent with scikit-learn. 3 | 4 | In one dimension, points are completely ordered (i.e. every point is either greater than, less than, or equal to every other point). This case is handled by the sklearn.isotonic module. In the multidimensional case, a complete ordering of points is no longer generally possible, but points may still be partially ordered: If all of the coordinates of the first point are less than or equal to the coordinates of the second point, then the first point can be deemed less than or equal to the second. For example, in two dimensions, the points (1,3) and (2,2) are not ordered, but the point (1,3) is less than the point (2,4), which is less than the point (2,5). A multidimensional isotonic function is guaranteed to yield values that are nondecreasing when evaluated at a series of nondecreasing points; for example, f(1,3) <= f(2,4) <= f(2,5). 5 | 6 | A multidimensional isotonic regression takes a set of multidimensional points X, with corresponding values y, and returns an isotonic function with minimum squared distance from y. Algorithmically, this turns out to be mappable to a network flow problem (see [Picard 1976](http://dx.doi.org/10.1287/mnsc.22.11.1268) or [Spouge, Wan, and Wilbur 2003](http://dx.doi.org/10.1023/A:1023901806339)). This procedure is sensitive only to feature ranks, not values. 7 | 8 | This package requires [scikit-learn](https://github.com/scikit-learn/scikit-learn) and [python-igraph](https://github.com/igraph/python-igraph), and all sub-dependencies. 
9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Alexander P. Fields 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of multiisotonic nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# Author: Alex Fields (github.com/alexfields)

import numpy as np
from scipy import sparse
import igraph
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_X_y, check_array


class MultiIsotonicRegressor(BaseEstimator, RegressorMixin):
    """Regress a target value as a non-decreasing function of each input attribute,
    when the other attributes are non-decreasing.

    The fit maps the multidimensional isotonic regression onto a min-cut/max-flow
    problem (Picard 1976; Spouge, Wan & Wilbur 2003) solved with python-igraph.

    Parameters
    ----------
    min_partition_size : int, default=1
        The minimum allowable size to which to partition the training set,
        to avoid overfitting. Partition steps that would produce a smaller
        set are not performed.
    """

    def __init__(self, min_partition_size=1):
        self.min_partition_size = min_partition_size

    def fit(self, X, y):
        """Fit a multidimensional isotonic regression model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training data.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        # In principle, Infs would be OK, but better to complain and let the user handle it
        X, y = check_X_y(X, y, y_numeric=True)

        # Order along the first axis so any candidate dominator of row i can only
        # appear at index > i; this avoids roughly half of the pairwise comparisons.
        myorder = np.argsort(X[:, 0])
        self._training_set = X[myorder, :]
        ysort = np.asarray(y, dtype=np.float64)[myorder]

        # Build the partial-order adjacency matrix: adj[i, j] = 1 iff point i <= point j
        # componentwise (only j > i in sorted order need be checked; the first
        # coordinate is already ordered by the argsort above).
        indices = []
        indptr = [0]
        for (i, Xrow) in enumerate(self._training_set[:, 1:]):
            indices.append(np.flatnonzero((Xrow <= self._training_set[i+1:, 1:]).all(1)) + i + 1)
            indptr.append(indptr[-1] + len(indices[-1]))
        # int8 counts (not np.bool, removed in NumPy 1.24; boolean sparse
        # subtraction is also unsupported on modern NumPy/SciPy).
        adj = sparse.csr_matrix((np.ones(indptr[-1], dtype=np.int8), np.concatenate(indices), indptr),
                                shape=(X.shape[0], X.shape[0]), dtype=np.int8)
        # Transitive reduction: keep edge (i, j) only when no intermediate k exists
        # with i <= k <= j. adj.dot(adj)[i, j] counts two-step paths, so
        # adj > adj.dot(adj) is True exactly where adj == 1 and the path count is 0.
        direct = adj > adj.dot(adj)
        edges_to_add = list(zip(*direct.nonzero()))
        mygraph = igraph.Graph(n=y.size, edges=edges_to_add, directed=True, vertex_attrs={'y': ysort})

        def _add_source_sink(graph_part):
            """Add in the edges connecting the source and sink vertices to the internal nodes of the graph"""
            y_part = np.array(graph_part.vs['y'])
            y_part -= y_part.mean()
            # Capacity larger than any possible cut of source/sink edges, so the
            # mincut never severs an internal (partial-order) edge.
            maxval = np.abs(y_part).sum() + 1
            vsrc = graph_part.vcount()
            vsnk = vsrc + 1
            graph_part.add_vertices(2)
            # Above-mean nodes attach to the source, the rest to the sink,
            # each weighted by its deviation from the partition mean.
            src_snk_edges = [(vsrc, curr_v) if curr_y > 0 else (curr_v, vsnk)
                             for (curr_v, curr_y) in enumerate(y_part)]
            n_internal_edges = graph_part.ecount()
            graph_part.add_edges(src_snk_edges)
            graph_part.es['c'] = ([maxval] * n_internal_edges) + list(np.abs(y_part))

        def _partition_graph(origV):
            """Recursively partition a subgraph (indexed by origV) according to the mincut algorithm

            Parameters
            ----------
            origV : list-like
                A list of indices of mygraph corresponding to the subgraph to partition

            Returns
            -------
            partition : list of lists
                A list of lists of indices indicating the final partitioning of the graph
            """
            currgraph = mygraph.subgraph(origV)
            _add_source_sink(currgraph)
            # Source and sink were appended last, so they are vcount()-2 and vcount()-1.
            currpart = currgraph.mincut(currgraph.vcount() - 2, currgraph.vcount() - 1, 'c').partition
            # Each side includes its source/sink vertex, hence the -1 in the size check
            # and the [:-1] slices below.
            if len(currpart[0]) - 1 < self.min_partition_size or len(currpart[1]) - 1 < self.min_partition_size:
                # this partitioning would result in one of the sets being too small - so don't do it!
                return [origV]
            else:
                return (_partition_graph([origV[idx] for idx in currpart[0][:-1]]) +
                        _partition_graph([origV[idx] for idx in currpart[1][:-1]]))

        # Each final partition is predicted as the mean of its members' targets.
        nodes_to_cover = y.size
        self._training_set_scores = np.empty(y.size)
        for part in _partition_graph(list(range(y.size))):
            self._training_set_scores[part] = ysort[part].mean()
            nodes_to_cover -= len(part)
        assert nodes_to_cover == 0  # every training point must land in exactly one partition

        return self

    def predict(self, X):
        """Predict according to the isotonic fit.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)

        Returns
        -------
        C : array, shape=(n_samples,)
            Predicted values
        """
        if not hasattr(self, '_training_set'):
            raise NotFittedError("This MultiIsotonicRegressor instance is not fitted yet.")
        X = check_array(X)
        res = np.empty(X.shape[0])
        # When the features are below the entire training set, fall back to the
        # minimum training set value (the smallest isotonic-consistent prediction).
        minval = self._training_set_scores.min()
        for (i, Xrow) in enumerate(X):
            lower_training_set = (self._training_set <= Xrow).all(1)
            if lower_training_set.any():
                # Isotonicity: the prediction is the max score over dominated points.
                res[i] = self._training_set_scores[lower_training_set].max()
            else:
                res[i] = minval
        return res