├── .gitignore
├── LICENSE
├── README.md
├── forest_cluster
│   ├── __init__.py
│   ├── forest_embedding.py
│   ├── k_medoids.py
│   ├── similarity.pyx
│   └── tests
│       ├── __init__.py
│       ├── fixtures.py
│       └── test_forest_clustering.py
├── setup.py
└── test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Joshua Loyal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Random Forest Clustering
Unsupervised clustering using random forests.

The pipeline: (1) train a random forest to discriminate real rows from
synthetic rows drawn from the data's empirical marginal distributions,
(2) use the forest's leaf assignments as a dissimilarity measure and embed
the points with a manifold learner (e.g. t-SNE or MDS), and (3) cluster the
embedding with k-means or k-medoids.

--------------------------------------------------------------------------------
/forest_cluster/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

from forest_cluster.forest_embedding import RandomForestEmbedding
from forest_cluster.k_medoids import KMedoids

--------------------------------------------------------------------------------
/forest_cluster/forest_embedding.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

import numpy as np
import scipy.sparse as sp
from sklearn.ensemble.forest import BaseForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state, check_array
from sklearn.preprocessing import OneHotEncoder


def bootstrap_sample_column(X, n_samples=None, random_state=1234):
    """bootstrap_sample_column

    Bootstrap sample (sample with replacement) a single column of a dataset.

    Parameters
    ----------
    X : np.ndarray (n_samples,)
        Column to bootstrap.

    n_samples : int
        Number of samples to generate. If `None`, generate a bootstrap
        sample with the same number of rows as `X`.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    np.ndarray (n_samples,):
        The bootstrapped column.
    """
    random_state = check_random_state(random_state)
    if n_samples is None:
        n_samples = X.shape[0]

    return random_state.choice(X, size=n_samples, replace=True)


def uniform_sample_column(X, n_samples=None, random_state=1234):
    """uniform_sample_column

    Sample a column uniformly between its minimum and maximum value.

    Parameters
    ----------
    X : np.ndarray (n_samples,)
        Column to sample.

    n_samples : int
        Number of samples to generate. If `None`, generate a sample with
        the same number of rows as `X`.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    np.ndarray (n_samples,):
        Uniformly sampled column.
    """
    random_state = check_random_state(random_state)
    if n_samples is None:
        n_samples = X.shape[0]

    min_X, max_X = np.min(X), np.max(X)
    return random_state.uniform(min_X, max_X, size=n_samples)
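

# Illustrative example (added, not part of the original module): both helpers
# draw a synthetic column from a real one. `bootstrap` resamples the observed
# values, while `uniform` draws from the observed range.
#
#   >>> col = np.array([1., 5., 5., 9.])
#   >>> bootstrap_sample_column(col, random_state=0)  # values from {1., 5., 9.}
#   >>> uniform_sample_column(col, random_state=0)    # floats in [1., 9.]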


def generate_synthetic_features(X, method='bootstrap', random_state=1234):
    """generate_synthetic_features

    Generate a synthetic dataset based on the empirical distribution
    of `X`.

    Parameters
    ----------
    X : np.ndarray (n_samples, n_features)
        Dataset whose empirical distribution is used to generate the
        synthetic dataset.

    method : str {'bootstrap', 'uniform'}
        Method used to generate the synthetic dataset. `bootstrap`
        samples each column with replacement. `uniform` generates
        a new column uniformly sampled between the minimum and
        maximum value of each column.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    synth_X : np.ndarray (n_samples, n_features)
        The synthetic dataset.
    """
    random_state = check_random_state(random_state)
    n_features = int(X.shape[1])
    synth_X = np.empty_like(X)
    for column in range(n_features):
        if method == 'bootstrap':
            synth_X[:, column] = bootstrap_sample_column(
                X[:, column], random_state=random_state)
        elif method == 'uniform':
            synth_X[:, column] = uniform_sample_column(
                X[:, column], random_state=random_state)
        else:
            raise ValueError('method must be either `bootstrap` or `uniform`.')

    return synth_X


def generate_discriminative_dataset(X, method='bootstrap', random_state=1234):
    """generate_discriminative_dataset.

    Generate a synthetic dataset based on the empirical distribution
    of `X`. A target column is returned that is 1 if the row comes from
    the real dataset and 0 if the row is synthetic. The number of
    synthetic rows generated equals the number of rows in the original
    dataset.

    Parameters
    ----------
    X : np.ndarray (n_samples, n_features)
        Dataset whose empirical distribution is used to generate the
        synthetic dataset.

    method : str {'bootstrap', 'uniform'}
        Method used to generate the synthetic dataset. `bootstrap`
        samples each column with replacement. `uniform` generates
        a new column uniformly sampled between the minimum and
        maximum value of each column.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    X_ : np.ndarray (2 * n_samples, n_features)
        Feature array for the combined dataset. The rows are randomly
        shuffled, so actual and synthetic samples are intermixed.

    y_ : np.ndarray (2 * n_samples,)
        Target column indicating whether the row is from the actual
        dataset (1) or synthetic (0).
    """
    random_state = check_random_state(random_state)
    n_samples = int(X.shape[0])

    synth_X = generate_synthetic_features(
        X, method=method, random_state=random_state)
    X_ = np.vstack((X, synth_X))
    y_ = np.concatenate((np.ones(n_samples), np.zeros(n_samples)))

    permutation_indices = random_state.permutation(np.arange(X_.shape[0]))
    X_ = X_[permutation_indices, :]
    y_ = y_[permutation_indices]

    return X_, y_
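

# Illustrative example (added, not part of the original module): the
# discriminative dataset doubles the row count; real rows get label 1 and
# synthetic rows get label 0.
#
#   >>> X = np.random.RandomState(0).normal(size=(100, 3))
#   >>> X_, y_ = generate_discriminative_dataset(X)
#   >>> X_.shape
#   (200, 3)
#   >>> int(y_.sum())   # half the rows are real
#   100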


class RandomForestEmbedding(BaseForest):
    """Very similar to scikit-learn's RandomTreesEmbedding; however, the
    forest is trained as a discriminator between the real dataset and a
    synthetic dataset drawn from its empirical marginal distributions.
    """
    def __init__(self,
                 n_estimators=10,
                 criterion='gini',
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features='auto',
                 max_leaf_nodes=None,
                 bootstrap=True,
                 sparse_output=True,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        super(RandomForestEmbedding, self).__init__(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=n_estimators,
            estimator_params=("criterion", "max_depth", "min_samples_split",
                              "min_samples_leaf", "min_weight_fraction_leaf",
                              "max_features", "max_leaf_nodes",
                              "random_state"),
            bootstrap=bootstrap,
            oob_score=False,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.sparse_output = sparse_output

    def _set_oob_score(self, X, y):
        raise NotImplementedError("OOB score not supported in tree embedding")

    def fit(self, X, y=None, sample_weight=None):
        self.fit_transform(X, y, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y=None, sample_weight=None):
        X = check_array(X, accept_sparse=['csc'], ensure_2d=False)

        if sp.issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Forward the estimator's random_state so the synthetic dataset
        # generation respects it rather than a fixed default seed.
        X_, y_ = generate_discriminative_dataset(
            X, random_state=self.random_state)

        super(RandomForestEmbedding, self).fit(X_, y_,
                                               sample_weight=sample_weight)

        self.one_hot_encoder_ = OneHotEncoder(sparse=True)
        if self.sparse_output:
            return self.one_hot_encoder_.fit_transform(self.apply(X))
        return self.apply(X)

    def transform(self, X):
        if self.sparse_output:
            # Reuse the encoder fitted in fit_transform instead of
            # refitting it on new data.
            return self.one_hot_encoder_.transform(self.apply(X))
        return self.apply(X)
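

# Illustrative usage sketch (added, not part of the original module):
#
#   >>> from forest_cluster import RandomForestEmbedding
#   >>> embedder = RandomForestEmbedding(n_estimators=100, random_state=0)
#   >>> leaves = embedder.fit_transform(X)   # sparse one-hot leaf indicators
#
# Two rows are similar when they land in the same leaf in many trees, so
# hamming-style distances on the leaf assignments act as the forest
# dissimilarity used downstream for embedding and clustering.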

--------------------------------------------------------------------------------
/forest_cluster/k_medoids.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""K-medoids clustering"""

# Authors: Timo Erkkilä
#          Antti Lehmussola
# License: BSD 3 clause

import numpy as np
import warnings

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import check_is_fitted


class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
    """
    k-medoids class.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        Number of medoids. Must be positive.

    distance_metric : string or callable, optional, default: 'euclidean'
        Distance metric to use.

    clustering_method : {'pam'}, optional, default: 'pam'
        Clustering mode to use.

    init : {'random', 'heuristic'}, optional, default: 'heuristic'
        Medoid initialization method.

    max_iter : int, optional, default: 300
        Maximum number of iterations when fitting.

    random_state : int, optional, default: None
        Random state for the random number generator.
    """

    # Supported clustering methods
    CLUSTERING_METHODS = ['pam']

    # Supported initialization methods
    INIT_METHODS = ['random', 'heuristic']

    def __init__(self, n_clusters=8, distance_metric='euclidean',
                 clustering_method='pam', init='heuristic',
                 max_iter=300, random_state=None):

        self.n_clusters = n_clusters
        self.distance_metric = distance_metric
        self.init = init
        self.max_iter = max_iter
        self.clustering_method = clustering_method
        self.random_state = random_state

    def _check_init_args(self):

        # Check n_clusters
        if self.n_clusters is None or self.n_clusters <= 0 or \
                not isinstance(self.n_clusters, int):
            raise ValueError("n_clusters has to be a positive integer")

        # Check distance_metric
        if callable(self.distance_metric):
            self.distance_func = self.distance_metric
        elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
            self.distance_func = \
                PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
        else:
            raise ValueError("distance_metric needs to be callable or one "
                             "of the following strings: "
                             "{}. Instead, '{}' was given.".format(
                                 PAIRWISE_DISTANCE_FUNCTIONS.keys(),
                                 self.distance_metric))

        # Check clustering_method
        if self.clustering_method not in self.CLUSTERING_METHODS:
            raise ValueError("clustering_method must be one of the "
                             "following: {}".format(self.CLUSTERING_METHODS))

        # Check init
        if self.init not in self.INIT_METHODS:
            raise ValueError("init needs to be one of the following: "
                             "{}".format(self.INIT_METHODS))

        # Check random state
        self.random_state_ = check_random_state(self.random_state)
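
    # Illustrative note (added, not part of the original class):
    # `distance_metric` accepts either a metric name known to scikit-learn
    # or a callable, e.g.
    #
    #   KMedoids(n_clusters=3, distance_metric='manhattan')
    #   KMedoids(n_clusters=3, distance_metric=my_pairwise_func)
    #
    # where `my_pairwise_func(X, Y=None)` is a hypothetical callable
    # returning a pairwise distance matrix.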

    def fit(self, X, y=None):
        """Fit K-Medoids to the provided data.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)

        Returns
        -------
        self
        """

        self._check_init_args()

        # Check that the array is good and attempt to convert it to
        # Numpy array if possible
        X = self._check_array(X)

        # Apply distance metric to get the distance matrix
        if self.distance_func:
            D = self.distance_func(X)
        else:
            D = X

        medoid_ics = self._get_initial_medoid_indices(D, self.n_clusters)

        # Old medoids will be stored here for reference
        old_medoid_ics = np.zeros((self.n_clusters,))

        # Continue the algorithm as long as the medoids keep changing and
        # the maximum number of iterations is not exceeded
        self.n_iter_ = 0
        while not np.all(old_medoid_ics == medoid_ics) and \
                self.n_iter_ < self.max_iter:

            self.n_iter_ += 1

            # Keep a copy of the old medoid assignments
            old_medoid_ics = np.copy(medoid_ics)

            # Get cluster indices
            cluster_ics = self._get_cluster_ics(D, medoid_ics)

            # Update medoids with the new cluster indices
            self._update_medoid_ics_in_place(D, cluster_ics, medoid_ics)

        # Expose labels_, the assignments of the training data to clusters
        self.labels_ = cluster_ics

        # Expose cluster centers, i.e. medoids
        self.cluster_centers_ = X.take(medoid_ics, axis=0)

        # Return self to enable method chaining
        return self

    def _check_array(self, X):

        X = check_array(X)

        # Check that the number of clusters is less than or equal to
        # the number of samples
        if self.n_clusters > X.shape[0]:
            raise ValueError("The number of medoids ({}) cannot be larger "
                             "than the number of samples ({})".format(
                                 self.n_clusters, X.shape[0]))

        return X

    def _get_cluster_ics(self, D, medoid_ics):
        """Returns cluster indices for D and current medoid indices"""

        # Assign each data point to the cluster whose medoid is closest
        cluster_ics = np.argmin(D[medoid_ics, :], axis=0)

        return cluster_ics
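
    # Illustrative example (added, not part of the original class): with a
    # 3x3 distance matrix and medoid indices [0, 2], the argmin above assigns
    # each column (data point) to its nearest medoid:
    #
    #   D = np.array([[0., 4., 9.],
    #                 [4., 0., 1.],
    #                 [9., 1., 0.]])
    #   np.argmin(D[[0, 2], :], axis=0)   # -> array([0, 1, 1])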

    def _update_medoid_ics_in_place(self, D, cluster_ics, medoid_ics):
        """In-place update of the medoid indices"""

        # Update the medoid for each cluster
        for cluster_idx in range(self.n_clusters):

            if sum(cluster_ics == cluster_idx) == 0:
                warnings.warn("Cluster {} is empty!".format(cluster_idx))
                continue

            # Find the current cost associated with cluster_idx: the sum of
            # the distances from the cluster members to the medoid.
            curr_cost = np.sum(D[medoid_ics[cluster_idx],
                                 cluster_ics == cluster_idx])

            # Extract the distance matrix between the data points
            # inside cluster_idx
            D_in = D[cluster_ics == cluster_idx, :]
            D_in = D_in[:, cluster_ics == cluster_idx]

            # Calculate the cost of making each in-cluster point the medoid
            all_costs = np.sum(D_in, axis=1)

            # Find the index of the smallest cost in cluster_idx
            min_cost_idx = np.argmin(all_costs)

            # Find the value of the minimum cost in cluster_idx
            min_cost = all_costs[min_cost_idx]

            # If the minimum cost is smaller than that of the currently
            # used medoid, switch to the new medoid in cluster_idx
            if min_cost < curr_cost:

                # Find the data points that belong to cluster_idx, and
                # assign the newly found medoid as the medoid for the cluster
                medoid_ics[cluster_idx] = \
                    np.where(cluster_ics == cluster_idx)[0][min_cost_idx]

    def transform(self, X):
        """Transforms X to cluster-distance space.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Data to transform.

        Returns
        -------
        X_new : array, shape=(n_samples, n_clusters)
            X transformed in the new space.
        """

        check_is_fitted(self, "cluster_centers_")

        # Apply the distance metric with respect to the cluster centers
        # (medoids), and return these distances
        return self.distance_func(X, Y=self.cluster_centers_)

    def predict(self, X):

        check_is_fitted(self, "cluster_centers_")

        # Check that the array is good and attempt to convert it to
        # Numpy array if possible
        X = check_array(X)

        # Apply the distance metric with respect to the cluster centers
        # (medoids)
        D = self.distance_func(X, Y=self.cluster_centers_)

        # Assign each data point to the cluster whose medoid is closest
        labels = np.argmin(D, axis=1)

        return labels

    def inertia(self, X):

        # Map the original X to the distance space
        Xt = self.transform(X)

        # Define inertia as the sum of the sample distances to their
        # closest cluster centers
        inertia = np.sum(np.min(Xt, axis=1))

        return inertia
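
    # Illustrative usage sketch (added, not part of the original class;
    # `X` and `X_new` are placeholder arrays):
    #
    #   km = KMedoids(n_clusters=2, random_state=0).fit(X)
    #   km.labels_            # cluster index for each training row
    #   km.cluster_centers_   # actual data rows chosen as medoids
    #   km.predict(X_new)     # nearest-medoid assignment for new rows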

    def _get_initial_medoid_indices(self, D, n_clusters):

        if self.init == 'random':  # Random initialization

            # Pick k random medoids as the initial ones
            medoids = self.random_state_.permutation(D.shape[0])[:n_clusters]

        elif self.init == 'heuristic':  # Initialization by heuristic

            # Pick the k data points with the smallest sum of distances to
            # every other point. These are the initial medoids.
            medoids = list(np.argsort(np.sum(D, axis=1))[:n_clusters])

        else:

            raise ValueError("Initialization not implemented for method: "
                             "'{}'".format(self.init))

        return medoids

--------------------------------------------------------------------------------
/forest_cluster/similarity.pyx:
--------------------------------------------------------------------------------
#cython: boundscheck=False
#cython: cdivision=True
#cython: wraparound=False

import numpy as np
cimport numpy as np
np.import_array()

ctypedef np.int_t DTYPE_t
ctypedef np.intp_t ITYPE_t


cdef inline double dist(DTYPE_t* x1, DTYPE_t* x2,
                        ITYPE_t size) nogil except -1:
    # Matching (hamming) distance: the fraction of coordinates on which
    # the two vectors disagree.
    cdef int n_eq = 0
    cdef ITYPE_t j
    for j in range(size):
        n_eq += (x1[j] == x2[j])
    return 1. - n_eq * 1. / size


def matching_dist(np.ndarray[DTYPE_t, ndim=1] X1, np.ndarray[DTYPE_t, ndim=1] X2):
    cdef ITYPE_t size = X1.shape[0]

    return dist(<DTYPE_t*> X1.data, <DTYPE_t*> X2.data, size)
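

# Illustrative example (added, not part of the original module; requires
# building the extension first, with arrays of dtype np.int_ to match
# DTYPE_t):
#
#   >>> import numpy as np
#   >>> from forest_cluster.similarity import matching_dist
#   >>> a = np.array([1, 2, 3], dtype=np.int_)
#   >>> b = np.array([1, 3, 1], dtype=np.int_)
#   >>> matching_dist(a, b)   # rows agree on 1 of 3 coordinates
#   0.666...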

--------------------------------------------------------------------------------
/forest_cluster/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshloyal/RandomForestClustering/f883eb5391befbd5e79080ebae979cc7b7e6f1b5/forest_cluster/tests/__init__.py

--------------------------------------------------------------------------------
/forest_cluster/tests/fixtures.py:
--------------------------------------------------------------------------------
import pytest
import numpy as np


def generate_clustered_data(seed=0, n_clusters=3, n_features=2,
                            n_samples_per_cluster=20, std=.4):
    prng = np.random.RandomState(seed)

    # The data is deliberately shifted away from zero to check the
    # clustering algorithm's robustness with regard to non-centered data.
    means = np.array([[1, 1, 1, 0],
                      [-1, -1, 0, 1],
                      [1, -1, 1, 1],
                      [-1, 1, 1, 0],
                      ]) + 10

    X = np.empty((0, n_features))
    for i in range(n_clusters):
        X = np.r_[X, means[i][:n_features]
                  + std * prng.randn(n_samples_per_cluster, n_features)]
    return X


@pytest.fixture
def simple_cluster():
    return generate_clustered_data

--------------------------------------------------------------------------------
/forest_cluster/tests/test_forest_clustering.py:
--------------------------------------------------------------------------------
import forest_cluster as rfc
from forest_cluster.tests.fixtures import simple_cluster


def test_forest_clusterer(simple_cluster):
    X = simple_cluster()
    # The package exports RandomForestEmbedding (there is no
    # RandomForestClusterer class), so exercise the embedding step.
    embedder = rfc.RandomForestEmbedding(n_estimators=10, random_state=1234)
    leaves = embedder.fit_transform(X)
    assert leaves.shape[0] == X.shape[0]

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
import subprocess
import sys
import contextlib

from setuptools import Extension, setup
import numpy


PACKAGES = [
    'forest_cluster',
    'forest_cluster.tests',
]

CYTHON_MODS = [
    'forest_cluster.similarity'
]


@contextlib.contextmanager
def chdir(new_dir):
    old_dir = os.getcwd()
    try:
        # Actually change into the new directory; the original version
        # only touched sys.path and never called os.chdir.
        os.chdir(new_dir)
        sys.path.insert(0, new_dir)
        yield
    finally:
        del sys.path[0]
        os.chdir(old_dir)


def clean(path):
    for name in CYTHON_MODS:
        name = name.replace('.', '/')
        for ext in ['.c', '.cpp', '.so']:
            file_path = os.path.join(path, name + ext)
            if os.path.exists(file_path):
                os.unlink(file_path)


def get_python_package(root):
    return os.path.join(root, 'forest_cluster')


def generate_sources(root):
    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pyx'):
                yield os.path.join(base, filename)


def generate_cython(root, cython_cov=False):
    print("Cythonizing sources")
    for source in generate_sources(get_python_package(root)):
        cythonize_source(source, cython_cov)


def cythonize_source(source, cython_cov=False):
    print("Processing %s" % source)

    flags = ['--fast-fail']
    if cython_cov:
        flags.extend(['--directive', 'linetrace=True'])
        flags.extend(['--directive', 'binding=True'])

    try:
        p = subprocess.call(['cython'] + flags + [source])
        if p != 0:
            raise Exception('Cython failed')
    except OSError:
        raise OSError('Cython needs to be installed')


def generate_extensions(root, macros=[]):
    ext_modules = []
    for mod_name in CYTHON_MODS:
        mod_path = mod_name.replace('.', '/') + '.c'
        ext_modules.append(
            Extension(mod_name,
                      sources=[mod_path],
                      include_dirs=[numpy.get_include()],
                      extra_compile_args=['-O3', '-fPIC'],
                      define_macros=macros))

    return ext_modules


def setup_package():
    root = os.path.abspath(os.path.dirname(__file__))

    if len(sys.argv) > 1 and sys.argv[1] == 'clean':
        return clean(root)

    cython_cov = 'CYTHON_COV' in os.environ

    macros = []
    if cython_cov:
        print("Adding coverage information to cythonized files.")
        macros = [('CYTHON_TRACE', 1)]

    with chdir(root):
        generate_cython(root, cython_cov)
        ext_modules = generate_extensions(root, macros=macros)
        setup(
            name="Random Forest Clustering",
            version='0.1.0',
            description='Unsupervised Clustering using Random Forests',
            author='Joshua D. Loyal',
            url='https://github.com/joshloyal/RandomForestClustering',
            license='MIT',
            install_requires=['numpy', 'scipy', 'scikit-learn', 'joblib'],
            packages=PACKAGES,
            ext_modules=ext_modules,
        )


if __name__ == '__main__':
    setup_package()
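

# Illustrative build workflow (assumed typical usage, not taken from the
# repository's docs):
#
#   $ pip install numpy cython
#   $ python setup.py build_ext --inplace   # cythonize + compile similarity.pyx
#   $ python setup.py clean                 # remove generated .c/.cpp/.so files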

--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from __future__ import division

from sklearn import manifold
from sklearn import ensemble
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import forest_cluster as rfc
from forest_cluster import KMedoids
import numpy as np
from mysuper.datasets import fetch_cars, fetch_10kdiabetes
from scipy.spatial.distance import hamming
import seaborn as sns
import pandas as pd
import time
import scipy.sparse as sp


"""
Random Forest clustering works as follows:
1. Construct a dissimilarity measure using a random forest.
2. Use an embedding algorithm (MDS, t-SNE) to embed the points into a 2D
   space that preserves that dissimilarity measure.
3. Cluster the embedding using k-means or k-medoids.
"""


# Hamming distance between rows of leaf indices, e.g. [1, 2, 3] and
# [1, 3, 1] agree only in the first coordinate, giving a distance of 2/3.
def fast_hamming_binary_dense(X):
    # Note: operates on a dense 0/1 (one-hot) matrix.
    n_features = X.shape[1]
    D = np.dot(1 - X, X.T)
    return (D + D.T) / n_features


def fast_hamming_binary_sparse(X, n_matches=None):
    if n_matches:
        n_features = n_matches
    else:
        n_features = X.shape[1]
    H = (X * X.T).toarray()
    return 1 - H / n_features


def fast_hamming_dense(X):
    unique_values = np.unique(X)
    U = sp.csr_matrix((X == unique_values[0]).astype(np.int32))
    H = (U * U.transpose()).toarray()
    for unique_value in unique_values[1:]:
        U = sp.csr_matrix((X == unique_value).astype(np.int32))
        H += (U * U.transpose()).toarray()
    return 1 - H.astype(np.float64) / X.shape[1]


X = fetch_cars().values
#X = fetch_10kdiabetes(one_hot=False).values

n_trees = 5000
print('tree embedding')
t0 = time.time()
rf = rfc.RandomForestEmbedding(n_estimators=n_trees, random_state=10,
                               n_jobs=-1, sparse_output=False)
leaves = rf.fit_transform(X)
print('time: %r s' % (time.time() - t0))


print('embedding data')
t0 = time.time()

#if leaves.shape[1] > 50:
#    projection = TruncatedSVD(n_components=50, random_state=123).fit_transform(leaves)
#else:
#    projection = leaves.toarray()
#dissimilarity = fast_hamming_binary_sparse(leaves, n_matches=n_trees)
#projector = manifold.TSNE(random_state=1234, metric='precomputed')
projector = manifold.TSNE(random_state=1234, metric='hamming')
embedding = projector.fit_transform(leaves)


#projector = manifold.MDS(random_state=1234, dissimilarity='precomputed')
#embedding = projector.fit_transform(dissimilarity)
print('time: %r s' % (time.time() - t0))


print('clustering')
t0 = time.time()
clusterer = KMeans(n_clusters=4, random_state=1234, n_init=20, n_jobs=-1)
clusterer.fit(embedding)

#clusterer = KMedoids(n_clusters=3, random_state=1234, distance_metric='precomputed')
#clusterer.fit(np.load('hamming.npy'))
print('time: %r s' % (time.time() - t0))


df = pd.DataFrame({'x': embedding[:, 0], 'y': embedding[:, 1],
                   'z': clusterer.labels_})
sns.lmplot('x', 'y', hue='z', data=df, fit_reg=False)
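

# Illustrative sanity check (added example, not in the original script): for
# one-hot indicator matrices built from leaf indices, the sparse identity
# 1 - (U * U.T) / n_trees reproduces the hamming distance on the raw rows.
def _check_hamming_identity():
    raw = np.array([[1, 2, 3], [1, 3, 1], [2, 2, 3]])  # leaf index per tree
    # One indicator matrix per unique value, mirroring fast_hamming_dense.
    H = np.zeros((raw.shape[0], raw.shape[0]))
    for value in np.unique(raw):
        U = sp.csr_matrix((raw == value).astype(np.int32))
        H += (U * U.T).toarray()
    D = 1 - H / raw.shape[1]
    assert np.isclose(D[0, 1], hamming(raw[0], raw[1]))  # both give 2/3

--------------------------------------------------------------------------------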